From 31a03aa2bc5a51439d970ca94e788b4006d8e08b Mon Sep 17 00:00:00 2001 From: Andreas Marek <amarek@rzg.mpg.de> Date: Wed, 24 Feb 2016 12:28:04 +0100 Subject: [PATCH] Add migration notice --- .gitignore | 21 - .gitlab-ci.yml | 2 - COPYING/COPYING | 30 - COPYING/gpl.txt | 674 --- COPYING/lgpl.txt | 165 - Doxyfile.in | 2442 -------- INSTALL | 227 - INSTALL_FROM_GIT_VERSION | 19 - LIBRARY_INTERFACE | 34 - Makefile.am | 383 -- README | 89 - RELEASE_NOTES | 32 - THIS_REPO_HAS_MOVED | 14 + autogen.sh | 8 - configure.ac | 749 --- doxygen.am | 156 - elpa.pc.in | 12 - elpa/elpa.h | 2 - elpa/elpa_kernel_constants.h | 21 - fdep/LICENSE | 19 - fdep/README | 99 - fdep/fortran_dependencies.m4 | 24 - fdep/fortran_dependencies.mk | 101 - fdep/fortran_dependencies.pl | 89 - fdep/test_project/.gitignore | 38 - fdep/test_project/Makefile.am | 28 - fdep/test_project/autogen.sh | 8 - fdep/test_project/configure.ac | 37 - fdep/test_project/fdep | 1 - fdep/test_project/src/bar.F90 | 17 - fdep/test_project/src/foo.F90 | 5 - fdep/test_project/src2/baz.F90 | 18 - m4/ax_check_gnu_make.m4 | 78 - m4/ax_elpa_gpu_version_only.m4 | 48 - m4/ax_elpa_openmp.m4 | 98 - m4/ax_elpa_specific_kernels.m4 | 132 - m4/ax_prog_cc_mpi.m4 | 166 - m4/ax_prog_cxx_mpi.m4 | 173 - m4/ax_prog_doxygen.m4 | 532 -- m4/ax_prog_fc_mpi.m4 | 157 - man/get_elpa_communicators.3 | 59 - man/get_elpa_row_col_comms.3 | 61 - man/print_available_elpa2_kernels.1 | 27 - man/solve_evp_complex.3 | 51 - man/solve_evp_complex_1stage.3 | 88 - man/solve_evp_complex_2stage.3 | 91 - man/solve_evp_real.3 | 51 - man/solve_evp_real_1stage.3 | 86 - man/solve_evp_real_2stage.3 | 93 - src/elpa1.F90 | 476 -- src/elpa1_compute.F90 | 4025 ------------- src/elpa2.F90 | 540 -- src/elpa2_compute.F90 | 5303 ----------------- src/elpa2_kernels/README_elpa2_kernels.txt | 156 - src/elpa2_kernels/elpa2_kernels_asm_x86_64.s | 701 --- src/elpa2_kernels/elpa2_kernels_complex.F90 | 888 --- .../elpa2_kernels_complex_simple.F90 | 177 - 
.../elpa2_kernels_complex_sse-avx_1hv.cpp | 1057 ---- .../elpa2_kernels_complex_sse-avx_2hv.cpp | 2763 --------- src/elpa2_kernels/elpa2_kernels_real.F90 | 662 -- src/elpa2_kernels/elpa2_kernels_real_bgp.f90 | 799 --- src/elpa2_kernels/elpa2_kernels_real_bgq.f90 | 662 -- .../elpa2_kernels_real_simple.F90 | 136 - .../elpa2_kernels_real_sse-avx_2hv.c | 1718 ------ .../elpa2_kernels_real_sse-avx_4hv.c | 2471 -------- .../elpa2_kernels_real_sse-avx_6hv.c | 3104 ---------- .../mod_single_hh_trafo_real.F90 | 67 - src/elpa2_utilities.F90 | 525 -- src/elpa_c_interface.F90 | 318 - src/elpa_qr/elpa_pdgeqrf.F90 | 2411 -------- src/elpa_qr/elpa_pdlarfb.f90 | 639 -- src/elpa_qr/elpa_qrkernels.f90 | 783 --- src/elpa_qr/qr_utils.f90 | 396 -- src/elpa_reduce_add_vectors.X90 | 188 - src/elpa_transpose_vectors.X90 | 190 - src/elpa_utilities.F90 | 136 - src/ftimings/COPYING.LESSER | 165 - src/ftimings/ftimings.F90 | 1472 ----- src/ftimings/ftimings_type.F90 | 23 - src/ftimings/ftimings_value.F90 | 93 - src/ftimings/highwater_mark.c | 43 - src/ftimings/papi.c | 164 - src/ftimings/resident_set_size.c | 34 - src/ftimings/time.c | 40 - src/ftimings/virtual_memory.c | 35 - src/mod_compute_hh_trafo_complex.F90 | 260 - src/mod_compute_hh_trafo_real.F90 | 417 -- src/mod_pack_unpack_complex.F90 | 130 - src/mod_pack_unpack_real.F90 | 139 - src/mod_precision.f90 | 9 - src/print_available_elpa2_kernels.F90 | 126 - src/redist_band.X90 | 323 - src/timer.F90 | 9 - .../elpa1_test_complex_c_version.c | 225 - .../elpa1_test_real_c_version.c | 218 - .../elpa2_test_complex_c_version.c | 229 - .../elpa2_test_real_c_version.c | 223 - test/fortran_test_programs/read_real.F90 | 432 -- test/fortran_test_programs/test_complex.F90 | 332 -- test/fortran_test_programs/test_complex2.F90 | 360 -- .../test_complex2_choose_kernel_with_api.F90 | 364 -- .../test_complex2_default_kernel.F90 | 362 -- test/fortran_test_programs/test_real.F90 | 338 -- test/fortran_test_programs/test_real2.F90 | 364 -- 
.../test_real2_choose_kernel_with_api.F90 | 357 -- .../test_real2_default_kernel.F90 | 355 -- ..._real2_default_kernel_qr_decomposition.F90 | 367 -- .../test_real_with_c.F90 | 424 -- test/shared_sources/blacs_infrastructure.F90 | 148 - test/shared_sources/call_elpa1.c | 59 - test/shared_sources/call_elpa2.c | 68 - test/shared_sources/check_correctnes.F90 | 251 - test/shared_sources/mod_from_c.F90 | 160 - test/shared_sources/prepare_matrix.F90 | 168 - test/shared_sources/read_input_parameters.F90 | 103 - test/shared_sources/redir.c | 125 - test/shared_sources/redirect.F90 | 118 - test/shared_sources/setup_mpi.F90 | 86 - test/shared_sources/util.F90 | 76 - test_project/Makefile.am | 10 - test_project/autogen.sh | 1 - test_project/configure.ac | 93 - test_project/fdep | 1 - test_project/m4/ax_prog_fc_mpi.m4 | 1 - test_project/src/test_real.F90 | 1 - 125 files changed, 14 insertions(+), 48733 deletions(-) delete mode 100644 .gitignore delete mode 100644 .gitlab-ci.yml delete mode 100644 COPYING/COPYING delete mode 100644 COPYING/gpl.txt delete mode 100644 COPYING/lgpl.txt delete mode 100644 Doxyfile.in delete mode 100644 INSTALL delete mode 100644 INSTALL_FROM_GIT_VERSION delete mode 100644 LIBRARY_INTERFACE delete mode 100644 Makefile.am delete mode 100644 README delete mode 100644 RELEASE_NOTES create mode 100644 THIS_REPO_HAS_MOVED delete mode 100755 autogen.sh delete mode 100644 configure.ac delete mode 100644 doxygen.am delete mode 100644 elpa.pc.in delete mode 100644 elpa/elpa.h delete mode 100644 elpa/elpa_kernel_constants.h delete mode 100644 fdep/LICENSE delete mode 100644 fdep/README delete mode 100644 fdep/fortran_dependencies.m4 delete mode 100644 fdep/fortran_dependencies.mk delete mode 100755 fdep/fortran_dependencies.pl delete mode 100644 fdep/test_project/.gitignore delete mode 100644 fdep/test_project/Makefile.am delete mode 100755 fdep/test_project/autogen.sh delete mode 100644 fdep/test_project/configure.ac delete mode 120000 fdep/test_project/fdep 
delete mode 100644 fdep/test_project/src/bar.F90 delete mode 100644 fdep/test_project/src/foo.F90 delete mode 100644 fdep/test_project/src2/baz.F90 delete mode 100644 m4/ax_check_gnu_make.m4 delete mode 100644 m4/ax_elpa_gpu_version_only.m4 delete mode 100644 m4/ax_elpa_openmp.m4 delete mode 100644 m4/ax_elpa_specific_kernels.m4 delete mode 100644 m4/ax_prog_cc_mpi.m4 delete mode 100644 m4/ax_prog_cxx_mpi.m4 delete mode 100644 m4/ax_prog_doxygen.m4 delete mode 100644 m4/ax_prog_fc_mpi.m4 delete mode 100644 man/get_elpa_communicators.3 delete mode 100644 man/get_elpa_row_col_comms.3 delete mode 100644 man/print_available_elpa2_kernels.1 delete mode 100644 man/solve_evp_complex.3 delete mode 100644 man/solve_evp_complex_1stage.3 delete mode 100644 man/solve_evp_complex_2stage.3 delete mode 100644 man/solve_evp_real.3 delete mode 100644 man/solve_evp_real_1stage.3 delete mode 100644 man/solve_evp_real_2stage.3 delete mode 100644 src/elpa1.F90 delete mode 100644 src/elpa1_compute.F90 delete mode 100644 src/elpa2.F90 delete mode 100644 src/elpa2_compute.F90 delete mode 100644 src/elpa2_kernels/README_elpa2_kernels.txt delete mode 100644 src/elpa2_kernels/elpa2_kernels_asm_x86_64.s delete mode 100644 src/elpa2_kernels/elpa2_kernels_complex.F90 delete mode 100644 src/elpa2_kernels/elpa2_kernels_complex_simple.F90 delete mode 100644 src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp delete mode 100644 src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp delete mode 100644 src/elpa2_kernels/elpa2_kernels_real.F90 delete mode 100644 src/elpa2_kernels/elpa2_kernels_real_bgp.f90 delete mode 100644 src/elpa2_kernels/elpa2_kernels_real_bgq.f90 delete mode 100644 src/elpa2_kernels/elpa2_kernels_real_simple.F90 delete mode 100644 src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c delete mode 100644 src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c delete mode 100644 src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c delete mode 100644 
src/elpa2_kernels/mod_single_hh_trafo_real.F90 delete mode 100644 src/elpa2_utilities.F90 delete mode 100644 src/elpa_c_interface.F90 delete mode 100644 src/elpa_qr/elpa_pdgeqrf.F90 delete mode 100644 src/elpa_qr/elpa_pdlarfb.f90 delete mode 100644 src/elpa_qr/elpa_qrkernels.f90 delete mode 100644 src/elpa_qr/qr_utils.f90 delete mode 100644 src/elpa_reduce_add_vectors.X90 delete mode 100644 src/elpa_transpose_vectors.X90 delete mode 100644 src/elpa_utilities.F90 delete mode 100644 src/ftimings/COPYING.LESSER delete mode 100644 src/ftimings/ftimings.F90 delete mode 100644 src/ftimings/ftimings_type.F90 delete mode 100644 src/ftimings/ftimings_value.F90 delete mode 100644 src/ftimings/highwater_mark.c delete mode 100644 src/ftimings/papi.c delete mode 100644 src/ftimings/resident_set_size.c delete mode 100644 src/ftimings/time.c delete mode 100644 src/ftimings/virtual_memory.c delete mode 100644 src/mod_compute_hh_trafo_complex.F90 delete mode 100644 src/mod_compute_hh_trafo_real.F90 delete mode 100644 src/mod_pack_unpack_complex.F90 delete mode 100644 src/mod_pack_unpack_real.F90 delete mode 100644 src/mod_precision.f90 delete mode 100644 src/print_available_elpa2_kernels.F90 delete mode 100644 src/redist_band.X90 delete mode 100644 src/timer.F90 delete mode 100644 test/c_test_programs/elpa1_test_complex_c_version.c delete mode 100644 test/c_test_programs/elpa1_test_real_c_version.c delete mode 100644 test/c_test_programs/elpa2_test_complex_c_version.c delete mode 100644 test/c_test_programs/elpa2_test_real_c_version.c delete mode 100644 test/fortran_test_programs/read_real.F90 delete mode 100644 test/fortran_test_programs/test_complex.F90 delete mode 100644 test/fortran_test_programs/test_complex2.F90 delete mode 100644 test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 delete mode 100644 test/fortran_test_programs/test_complex2_default_kernel.F90 delete mode 100644 test/fortran_test_programs/test_real.F90 delete mode 100644 
test/fortran_test_programs/test_real2.F90 delete mode 100644 test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 delete mode 100644 test/fortran_test_programs/test_real2_default_kernel.F90 delete mode 100644 test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 delete mode 100644 test/fortran_test_programs/test_real_with_c.F90 delete mode 100644 test/shared_sources/blacs_infrastructure.F90 delete mode 100644 test/shared_sources/call_elpa1.c delete mode 100644 test/shared_sources/call_elpa2.c delete mode 100644 test/shared_sources/check_correctnes.F90 delete mode 100644 test/shared_sources/mod_from_c.F90 delete mode 100644 test/shared_sources/prepare_matrix.F90 delete mode 100644 test/shared_sources/read_input_parameters.F90 delete mode 100644 test/shared_sources/redir.c delete mode 100644 test/shared_sources/redirect.F90 delete mode 100644 test/shared_sources/setup_mpi.F90 delete mode 100644 test/shared_sources/util.F90 delete mode 100644 test_project/Makefile.am delete mode 120000 test_project/autogen.sh delete mode 100644 test_project/configure.ac delete mode 120000 test_project/fdep delete mode 120000 test_project/m4/ax_prog_fc_mpi.m4 delete mode 120000 test_project/src/test_real.F90 diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 2cb2b5d6a..000000000 --- a/.gitignore +++ /dev/null @@ -1,21 +0,0 @@ -*.o -*.lo -*.deps -*.dirstamp -*.libs -*.fortran_dependencies -modules/ -Makefile.in -aclocal.m4 -ar-lib -autom4te.cache -compile -config.guess -config.h.in -config.sub -configure -depcomp -install-sh -ltmain.sh -missing -test-driver diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml deleted file mode 100644 index 30684aadb..000000000 --- a/.gitlab-ci.yml +++ /dev/null @@ -1,2 +0,0 @@ -jobs: - script: ./autogen.sh && ./configure && make && make check TEST_FLAGS='1500 50 16' diff --git a/COPYING/COPYING b/COPYING/COPYING deleted file mode 100644 index 870f85e0e..000000000 --- a/COPYING/COPYING +++ /dev/null @@ -1,30 +0,0 
@@ -Licensing and copyright terms for the ELPA library: -ELPA Consortium (2011) - -**** - -Copyright of the original code rests with the authors inside the ELPA -consortium. The copyright of any additional modifications shall rest -with their original authors, but shall adhere to the licensing terms -set forth below. - -**** - -The code is distributed under the terms of the GNU Lesser General Public -License version 3 (LGPL). The full text can be found in the file "lgpl.txt" -in this repository. "lgpl.txt" makes reference to the GPL v3, which can also -be found in this repository ("gpl.txt"). - -**** -ELPA reflects a substantial effort on the part of the original -ELPA consortium, and we ask you to respect the spirit of the -license that we chose: i.e., please contribute any changes you -may have back to the original ELPA library distribution, and keep -any derivatives of ELPA under the same license that we chose for -the original distribution, the GNU Lesser General Public License. - -When in doubt, talk to us. What we would like to ensure is that the ELPA -code can be used as needed, while providing a strong incentive for -others to contribute their modifications back to the original development. - -**** diff --git a/COPYING/gpl.txt b/COPYING/gpl.txt deleted file mode 100644 index 94a9ed024..000000000 --- a/COPYING/gpl.txt +++ /dev/null @@ -1,674 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. 
By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. 
- - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. 
- - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. 
- - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. 
The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. 
- - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. 
- - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. 
This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. 
For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. 
Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. 
- - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. 
- - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. 
Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. 
- - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. 
- - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. 
You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. 
The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. 
- - <one line to give the program's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - <program> Copyright (C) <year> <name of author> - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -<http://www.gnu.org/licenses/>. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. 
If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -<http://www.gnu.org/philosophy/why-not-lgpl.html>. diff --git a/COPYING/lgpl.txt b/COPYING/lgpl.txt deleted file mode 100644 index 65c5ca88a..000000000 --- a/COPYING/lgpl.txt +++ /dev/null @@ -1,165 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - - This version of the GNU Lesser General Public License incorporates -the terms and conditions of version 3 of the GNU General Public -License, supplemented by the additional permissions listed below. - - 0. Additional Definitions. - - As used herein, "this License" refers to version 3 of the GNU Lesser -General Public License, and the "GNU GPL" refers to version 3 of the GNU -General Public License. - - "The Library" refers to a covered work governed by this License, -other than an Application or a Combined Work as defined below. - - An "Application" is any work that makes use of an interface provided -by the Library, but which is not otherwise based on the Library. -Defining a subclass of a class defined by the Library is deemed a mode -of using an interface provided by the Library. - - A "Combined Work" is a work produced by combining or linking an -Application with the Library. The particular version of the Library -with which the Combined Work was made is also called the "Linked -Version". - - The "Minimal Corresponding Source" for a Combined Work means the -Corresponding Source for the Combined Work, excluding any source code -for portions of the Combined Work that, considered in isolation, are -based on the Application, and not on the Linked Version. 
- - The "Corresponding Application Code" for a Combined Work means the -object code and/or source code for the Application, including any data -and utility programs needed for reproducing the Combined Work from the -Application, but excluding the System Libraries of the Combined Work. - - 1. Exception to Section 3 of the GNU GPL. - - You may convey a covered work under sections 3 and 4 of this License -without being bound by section 3 of the GNU GPL. - - 2. Conveying Modified Versions. - - If you modify a copy of the Library, and, in your modifications, a -facility refers to a function or data to be supplied by an Application -that uses the facility (other than as an argument passed when the -facility is invoked), then you may convey a copy of the modified -version: - - a) under this License, provided that you make a good faith effort to - ensure that, in the event an Application does not supply the - function or data, the facility still operates, and performs - whatever part of its purpose remains meaningful, or - - b) under the GNU GPL, with none of the additional permissions of - this License applicable to that copy. - - 3. Object Code Incorporating Material from Library Header Files. - - The object code form of an Application may incorporate material from -a header file that is part of the Library. You may convey such object -code under terms of your choice, provided that, if the incorporated -material is not limited to numerical parameters, data structure -layouts and accessors, or small macros, inline functions and templates -(ten or fewer lines in length), you do both of the following: - - a) Give prominent notice with each copy of the object code that the - Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the object code with a copy of the GNU GPL and this license - document. - - 4. Combined Works. 
- - You may convey a Combined Work under terms of your choice that, -taken together, effectively do not restrict modification of the -portions of the Library contained in the Combined Work and reverse -engineering for debugging such modifications, if you also do each of -the following: - - a) Give prominent notice with each copy of the Combined Work that - the Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the Combined Work with a copy of the GNU GPL and this license - document. - - c) For a Combined Work that displays copyright notices during - execution, include the copyright notice for the Library among - these notices, as well as a reference directing the user to the - copies of the GNU GPL and this license document. - - d) Do one of the following: - - 0) Convey the Minimal Corresponding Source under the terms of this - License, and the Corresponding Application Code in a form - suitable for, and under terms that permit, the user to - recombine or relink the Application with a modified version of - the Linked Version to produce a modified Combined Work, in the - manner specified by section 6 of the GNU GPL for conveying - Corresponding Source. - - 1) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (a) uses at run time - a copy of the Library already present on the user's computer - system, and (b) will operate properly with a modified version - of the Library that is interface-compatible with the Linked - Version. - - e) Provide Installation Information, but only if you would otherwise - be required to provide such information under section 6 of the - GNU GPL, and only to the extent that such information is - necessary to install and execute a modified version of the - Combined Work produced by recombining or relinking the - Application with a modified version of the Linked Version. 
(If - you use option 4d0, the Installation Information must accompany - the Minimal Corresponding Source and Corresponding Application - Code. If you use option 4d1, you must provide the Installation - Information in the manner specified by section 6 of the GNU GPL - for conveying Corresponding Source.) - - 5. Combined Libraries. - - You may place library facilities that are a work based on the -Library side by side in a single library together with other library -facilities that are not Applications and are not covered by this -License, and convey such a combined library under terms of your -choice, if you do both of the following: - - a) Accompany the combined library with a copy of the same work based - on the Library, uncombined with any other library facilities, - conveyed under the terms of this License. - - b) Give prominent notice with the combined library that part of it - is a work based on the Library, and explaining where to find the - accompanying uncombined form of the same work. - - 6. Revised Versions of the GNU Lesser General Public License. - - The Free Software Foundation may publish revised and/or new versions -of the GNU Lesser General Public License from time to time. Such new -versions will be similar in spirit to the present version, but may -differ in detail to address new problems or concerns. - - Each version is given a distinguishing version number. If the -Library as you received it specifies that a certain numbered version -of the GNU Lesser General Public License "or any later version" -applies to it, you have the option of following the terms and -conditions either of that published version or of any later version -published by the Free Software Foundation. If the Library as you -received it does not specify a version number of the GNU Lesser -General Public License, you may choose any version of the GNU Lesser -General Public License ever published by the Free Software Foundation. 
- - If the Library as you received it specifies that a proxy can decide -whether future versions of the GNU Lesser General Public License shall -apply, that proxy's public statement of acceptance of any version is -permanent authorization for you to choose that version for the -Library. diff --git a/Doxyfile.in b/Doxyfile.in deleted file mode 100644 index 69a562fca..000000000 --- a/Doxyfile.in +++ /dev/null @@ -1,2442 +0,0 @@ -# Doxyfile 1.8.10 - -# This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project. -# -# All text after a double hash (##) is considered a comment and is placed in -# front of the TAG it is preceding. -# -# All text after a single hash (#) is considered a comment and will be ignored. -# The format is: -# TAG = value [value, ...] -# For lists, items can also be appended using: -# TAG += value [value, ...] -# Values that contain spaces should be placed between quotes (\" \"). - -#--------------------------------------------------------------------------- -# Project related configuration options -#--------------------------------------------------------------------------- - -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv -# for the list of possible encodings. -# The default value is: UTF-8. - -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by -# double-quotes, unless you are using Doxywizard) that should identify the -# project for which the documentation is generated. This name is used in the -# title of most generated pages and in a few other places. -# The default value is: My Project. 
- -PROJECT_NAME = Eigenvalue SoLvers for Petaflop-Applications (ELPA) - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. This -# could be handy for archiving the generated documentation or if some version -# control system is used. - -PROJECT_NUMBER = @PACKAGE_VERSION@ - -# Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give viewer a -# quick idea about the purpose of the project. Keep the description short. - -PROJECT_BRIEF = "Eigenvalue SoLvers for Petaflop-Applications (ELPA)" - -# With the PROJECT_LOGO tag one can specify a logo or an icon that is included -# in the documentation. The maximum height of the logo should not exceed 55 -# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy -# the logo to the output directory. - -PROJECT_LOGO = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path -# into which the generated documentation will be written. If a relative path is -# entered, it will be relative to the location where doxygen was started. If -# left blank the current directory will be used. - -OUTPUT_DIRECTORY = @DOXYGEN_OUTPUT_DIR@ - -# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this -# option can be useful when feeding doxygen a huge amount of source files, where -# putting all generated files in the same directory would otherwise causes -# performance problems for the file system. -# The default value is: NO. - -CREATE_SUBDIRS = NO - -# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII -# characters to appear in the names of generated files. If set to NO, non-ASCII -# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode -# U+3044. 
-# The default value is: NO. - -ALLOW_UNICODE_NAMES = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, -# Ukrainian and Vietnamese. -# The default value is: English. - -OUTPUT_LANGUAGE = English - -# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member -# descriptions after the members that are listed in the file and class -# documentation (similar to Javadoc). Set to NO to disable this. -# The default value is: YES. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief -# description of a member or function before the detailed description -# -# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. -# The default value is: YES. - -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator that is -# used to form the text in various listings. Each string in this list, if found -# as the leading text of the brief description, will be stripped from the text -# and the result, after processing the whole list, is used as the annotated -# text. Otherwise, the brief description is used as-is. 
If left blank, the -# following values are used ($name is automatically replaced with the name of -# the entity):The $name class, The $name widget, The $name file, is, provides, -# specifies, contains, represents, a, an and the. - -ABBREVIATE_BRIEF = - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# doxygen will generate a detailed section even if there is only a brief -# description. -# The default value is: NO. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. Constructors, destructors and assignment -# operators of the base classes will not be shown. -# The default value is: NO. - -INLINE_INHERITED_MEMB = NO - -# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path -# before files name in the file list and in the header files. If set to NO the -# shortest path that makes the file name unique will be used -# The default value is: YES. - -FULL_PATH_NAMES = YES - -# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. -# Stripping is only done if one of the specified strings matches the left-hand -# part of the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the path to -# strip. -# -# Note that you can specify absolute paths here, but also relative paths, which -# will be relative from the directory where doxygen is started. -# This tag requires that the tag FULL_PATH_NAMES is set to YES. - -STRIP_FROM_PATH = - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the -# path mentioned in the documentation of a class, which tells the reader which -# header file to include in order to use a class. If left blank only the name of -# the header file containing the class definition is used. 
Otherwise one should -# specify the list of include paths that are normally passed to the compiler -# using the -I flag. - -STRIP_FROM_INC_PATH = - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but -# less readable) file names. This can be useful is your file systems doesn't -# support long names like on DOS, Mac, or CD-ROM. -# The default value is: NO. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the -# first line (until the first dot) of a Javadoc-style comment as the brief -# description. If set to NO, the Javadoc-style will behave just like regular Qt- -# style comments (thus requiring an explicit @brief command for a brief -# description.) -# The default value is: NO. - -JAVADOC_AUTOBRIEF = NO - -# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first -# line (until the first dot) of a Qt-style comment as the brief description. If -# set to NO, the Qt-style will behave just like regular Qt-style comments (thus -# requiring an explicit \brief command for a brief description.) -# The default value is: NO. - -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a -# multi-line C++ special comment block (i.e. a block of //! or /// comments) as -# a brief description. This used to be the default behavior. The new default is -# to treat a multi-line C++ comment block as a detailed description. Set this -# tag to YES if you prefer the old behavior instead. -# -# Note that setting this tag to YES also means that rational rose comments are -# not recognized any more. -# The default value is: NO. - -MULTILINE_CPP_IS_BRIEF = NO - -# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the -# documentation from any documented member that it re-implements. -# The default value is: YES. 
- -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new -# page for each member. If set to NO, the documentation of a member will be part -# of the file/class/namespace that contains it. -# The default value is: NO. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen -# uses this value to replace tabs by spaces in code fragments. -# Minimum value: 1, maximum value: 16, default value: 4. - -TAB_SIZE = 4 - -# This tag can be used to specify a number of aliases that act as commands in -# the documentation. An alias has the form: -# name=value -# For example adding -# "sideeffect=@par Side Effects:\n" -# will allow you to put the command \sideeffect (or @sideeffect) in the -# documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines. - -ALIASES = - -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources -# only. Doxygen will then generate output that is more tailored for C. For -# instance, some of the names that are used will be different. The list of all -# members will be omitted, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_FOR_C = NO - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or -# Python sources only. Doxygen will then generate output that is more tailored -# for that language. For instance, namespaces will be presented as packages, -# qualified scopes will look different, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources. 
Doxygen will then generate output that is tailored for Fortran. -# The default value is: NO. - -OPTIMIZE_FOR_FORTRAN = Yes - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for VHDL. -# The default value is: NO. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given -# extension. Doxygen has a built-in mapping, but you can override or extend it -# using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: -# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: -# Fortran. In the later case the parser tries to guess whether the code is fixed -# or free formatted code, this is the default for Fortran type files), VHDL. For -# instance to make doxygen treat .inc files as Fortran files (default is PHP), -# and .f files as C (default is Fortran), use: inc=Fortran f=C. -# -# Note: For files without extension you can use no_extension as a placeholder. -# -# Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. - -EXTENSION_MAPPING = f=FortranFixed f90=FortranFree F=FortranFixed F90=FortranFree - -# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments -# according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. -# The output of markdown processing is further processed by doxygen, so you can -# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in -# case of backward compatibilities issues. -# The default value is: YES. 
- -MARKDOWN_SUPPORT = YES - -# When enabled doxygen tries to link words that correspond to documented -# classes, or namespaces to their corresponding documentation. Such a link can -# be prevented in individual cases by putting a % sign in front of the word or -# globally by setting AUTOLINK_SUPPORT to NO. -# The default value is: YES. - -AUTOLINK_SUPPORT = YES - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should set this -# tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); -# versus func(std::string) {}). This also make the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. -# The default value is: NO. - -BUILTIN_STL_SUPPORT = NO - -# If you use Microsoft's C++/CLI language, you should set this option to YES to -# enable parsing support. -# The default value is: NO. - -CPP_CLI_SUPPORT = NO - -# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen -# will parse them like normal C++ but will assume all classes use public instead -# of private inheritance when no explicit protection keyword is present. -# The default value is: NO. - -SIP_SUPPORT = NO - -# For Microsoft's IDL there are propget and propput attributes to indicate -# getter and setter methods for a property. Setting this option to YES will make -# doxygen to replace the get and set methods by a property in the documentation. -# This will only work if the methods are indeed getting or setting a simple -# type. If this is not the case, or you want to show the methods anyway, you -# should set this option to NO. -# The default value is: YES. 
- -IDL_PROPERTY_SUPPORT = YES - -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default -# all members of a group must be documented explicitly. -# The default value is: NO. - -DISTRIBUTE_GROUP_DOC = NO - -# If one adds a struct or class to a group and this option is enabled, then also -# any nested class or struct is added to the same group. By default this option -# is disabled and one has to add nested compounds explicitly via \ingroup. -# The default value is: NO. - -GROUP_NESTED_COMPOUNDS = NO - -# Set the SUBGROUPING tag to YES to allow class member groups of the same type -# (for instance a group of public functions) to be put as a subgroup of that -# type (e.g. under the Public Functions section). Set it to NO to prevent -# subgrouping. Alternatively, this can be done per class using the -# \nosubgrouping command. -# The default value is: YES. - -SUBGROUPING = YES - -# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions -# are shown inside the group in which they are included (e.g. using \ingroup) -# instead of on a separate page (for HTML and Man pages) or section (for LaTeX -# and RTF). -# -# Note that this feature does not work in combination with -# SEPARATE_MEMBER_PAGES. -# The default value is: NO. - -INLINE_GROUPED_CLASSES = NO - -# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions -# with only public data fields or simple typedef fields will be shown inline in -# the documentation of the scope in which they are defined (i.e. file, -# namespace, or group documentation), provided this scope is documented. If set -# to NO, structs, classes, and unions are shown on a separate page (for HTML and -# Man pages) or section (for LaTeX and RTF). -# The default value is: NO. 
- -INLINE_SIMPLE_STRUCTS = NO - -# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or -# enum is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. When disabled the typedef will appear as a member of a file, -# namespace, or class. And the struct will be named TypeS. This can typically be -# useful for C code in case the coding convention dictates that all compound -# types are typedef'ed and only the typedef is referenced, never the tag name. -# The default value is: NO. - -TYPEDEF_HIDES_STRUCT = NO - -# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This -# cache is used to resolve symbols given their name and scope. Since this can be -# an expensive process and often the same symbol appears multiple times in the -# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small -# doxygen will become slower. If the cache is too large, memory is wasted. The -# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range -# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 -# symbols. At the end of a run doxygen will report the cache usage and suggest -# the optimal cache size from a speed point of view. -# Minimum value: 0, maximum value: 9, default value: 0. - -LOOKUP_CACHE_SIZE = 0 - -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- - -# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in -# documentation are documented, even if no documentation was available. Private -# class members and static file members will be hidden unless the -# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. 
-# Note: This will also disable the warnings about undocumented members that are -# normally produced when WARNINGS is set to YES. -# The default value is: NO. - -EXTRACT_ALL = YES - -# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will -# be included in the documentation. -# The default value is: NO. - -EXTRACT_PRIVATE = YES - -# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal -# scope will be included in the documentation. -# The default value is: NO. - -EXTRACT_PACKAGE = YES - -# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be -# included in the documentation. -# The default value is: NO. - -EXTRACT_STATIC = YES - -# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined -# locally in source files will be included in the documentation. If set to NO, -# only classes defined in header files are included. Does not have any effect -# for Java sources. -# The default value is: YES. - -EXTRACT_LOCAL_CLASSES = YES - -# This flag is only useful for Objective-C code. If set to YES, local methods, -# which are defined in the implementation section but not in the interface are -# included in the documentation. If set to NO, only methods in the interface are -# included. -# The default value is: NO. - -EXTRACT_LOCAL_METHODS = NO - -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base name of -# the file that contains the anonymous namespace. By default anonymous namespace -# are hidden. -# The default value is: NO. - -EXTRACT_ANON_NSPACES = NO - -# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all -# undocumented members inside documented classes or files. If set to NO these -# members will be included in the various overviews, but no documentation -# section is generated. 
This option has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_MEMBERS = NO - -# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. If set -# to NO, these classes will be included in the various overviews. This option -# has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_CLASSES = NO - -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# (class|struct|union) declarations. If set to NO, these declarations will be -# included in the documentation. -# The default value is: NO. - -HIDE_FRIEND_COMPOUNDS = NO - -# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any -# documentation blocks found inside the body of a function. If set to NO, these -# blocks will be appended to the function's detailed documentation block. -# The default value is: NO. - -HIDE_IN_BODY_DOCS = NO - -# The INTERNAL_DOCS tag determines if documentation that is typed after a -# \internal command is included. If the tag is set to NO then the documentation -# will be excluded. Set it to YES to include the internal documentation. -# The default value is: NO. - -INTERNAL_DOCS = NO - -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. -# The default value is: system dependent. - -CASE_SENSE_NAMES = YES - -# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with -# their full class and namespace scopes in the documentation. If set to YES, the -# scope will be hidden. -# The default value is: NO. 
- -HIDE_SCOPE_NAMES = NO - -# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will -# append additional text to a page's title, such as Class Reference. If set to -# YES the compound reference will be hidden. -# The default value is: NO. - -HIDE_COMPOUND_REFERENCE= NO - -# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of -# the files that are included by a file in the documentation of that file. -# The default value is: YES. - -SHOW_INCLUDE_FILES = YES - -# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each -# grouped member an include statement to the documentation, telling the reader -# which file to include in order to use the member. -# The default value is: NO. - -SHOW_GROUPED_MEMB_INC = NO - -# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include -# files with double quotes in the documentation rather than with sharp brackets. -# The default value is: NO. - -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the -# documentation for inline members. -# The default value is: YES. - -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the -# (detailed) documentation of file and class members alphabetically by member -# name. If set to NO, the members will appear in declaration order. -# The default value is: YES. - -SORT_MEMBER_DOCS = YES - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief -# descriptions of file, namespace and class members alphabetically by member -# name. If set to NO, the members will appear in declaration order. Note that -# this will also influence the order of the classes in the class list. -# The default value is: NO. 
- -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the -# (brief and detailed) documentation of class members so that constructors and -# destructors are listed first. If set to NO the constructors will appear in the -# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. -# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief -# member documentation. -# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting -# detailed member documentation. -# The default value is: NO. - -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy -# of group names into alphabetical order. If set to NO the group names will -# appear in their defined order. -# The default value is: NO. - -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by -# fully-qualified names, including namespaces. If set to NO, the class list will -# be sorted only by class name, not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the alphabetical -# list. -# The default value is: NO. - -SORT_BY_SCOPE_NAME = NO - -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper -# type resolution of all parameters of a function it will reject a match between -# the prototype and the implementation of a member function even if there is -# only one candidate or it is obvious which candidate to choose by doing a -# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still -# accept a match between prototype and implementation in such cases. -# The default value is: NO. - -STRICT_PROTO_MATCHING = NO - -# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo -# list. This list is created by putting \todo commands in the documentation. 
-# The default value is: YES. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test -# list. This list is created by putting \test commands in the documentation. -# The default value is: YES. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug -# list. This list is created by putting \bug commands in the documentation. -# The default value is: YES. - -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) -# the deprecated list. This list is created by putting \deprecated commands in -# the documentation. -# The default value is: YES. - -GENERATE_DEPRECATEDLIST= YES - -# The ENABLED_SECTIONS tag can be used to enable conditional documentation -# sections, marked by \if <section_label> ... \endif and \cond <section_label> -# ... \endcond blocks. - -ENABLED_SECTIONS = - -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the -# initial value of a variable or macro / define can have for it to appear in the -# documentation. If the initializer consists of more lines than specified here -# it will be hidden. Use a value of 0 to hide initializers completely. The -# appearance of the value of individual variables and macros / defines can be -# controlled using \showinitializer or \hideinitializer command in the -# documentation regardless of this setting. -# Minimum value: 0, maximum value: 10000, default value: 30. - -MAX_INITIALIZER_LINES = 30 - -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at -# the bottom of the documentation of classes and structs. If set to YES, the -# list will mention the files that were used to generate the documentation. -# The default value is: YES. - -SHOW_USED_FILES = YES - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. 
This -# will remove the Files entry from the Quick Index and from the Folder Tree View -# (if specified). -# The default value is: YES. - -SHOW_FILES = YES - -# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces -# page. This will remove the Namespaces entry from the Quick Index and from the -# Folder Tree View (if specified). -# The default value is: YES. - -SHOW_NAMESPACES = YES - -# The FILE_VERSION_FILTER tag can be used to specify a program or script that -# doxygen should invoke to get the current version for each file (typically from -# the version control system). Doxygen will invoke the program by executing (via -# popen()) the command command input-file, where command is the value of the -# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided -# by doxygen. Whatever the program writes to standard output is used as the file -# version. For an example see the documentation. - -FILE_VERSION_FILTER = - -# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed -# by doxygen. The layout file controls the global structure of the generated -# output files in an output format independent way. To create the layout file -# that represents doxygen's defaults, run doxygen with the -l option. You can -# optionally specify a file name after the option, if omitted DoxygenLayout.xml -# will be used as the name of the layout file. -# -# Note that if you run doxygen from a directory containing a file called -# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE -# tag is left empty. - -LAYOUT_FILE = - -# The CITE_BIB_FILES tag can be used to specify one or more bib files containing -# the reference definitions. This must be a list of .bib files. The .bib -# extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. 
-# For LaTeX the style of the bibliography can be controlled using -# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the -# search path. See also \cite for info how to create references. - -CITE_BIB_FILES = - -#--------------------------------------------------------------------------- -# Configuration options related to warning and progress messages -#--------------------------------------------------------------------------- - -# The QUIET tag can be used to turn on/off the messages that are generated to -# standard output by doxygen. If QUIET is set to YES this implies that the -# messages are off. -# The default value is: NO. - -QUIET = NO - -# The WARNINGS tag can be used to turn on/off the warning messages that are -# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES -# this implies that the warnings are on. -# -# Tip: Turn warnings on while writing the documentation. -# The default value is: YES. - -WARNINGS = YES - -# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate -# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag -# will automatically be disabled. -# The default value is: YES. - -WARN_IF_UNDOCUMENTED = YES - -# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some parameters -# in a documented function, or documenting parameters that don't exist or using -# markup commands wrongly. -# The default value is: YES. - -WARN_IF_DOC_ERROR = YES - -# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that -# are documented, but have no documentation for their parameters or return -# value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. -# The default value is: NO. 
- -WARN_NO_PARAMDOC = NO - -# The WARN_FORMAT tag determines the format of the warning messages that doxygen -# can produce. The string should contain the $file, $line, and $text tags, which -# will be replaced by the file and line number from which the warning originated -# and the warning text. Optionally the format may contain $version, which will -# be replaced by the version of the file (if it could be obtained via -# FILE_VERSION_FILTER) -# The default value is: $file:$line: $text. - -WARN_FORMAT = "$file:$line: $text" - -# The WARN_LOGFILE tag can be used to specify a file to which warning and error -# messages should be written. If left blank the output is written to standard -# error (stderr). - -WARN_LOGFILE = - -#--------------------------------------------------------------------------- -# Configuration options related to the input files -#--------------------------------------------------------------------------- - -# The INPUT tag is used to specify the files and/or directories that contain -# documented source files. You may enter file names like myfile.cpp or -# directories like /usr/src/myproject. Separate the files or directories with -# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING -# Note: If this tag is empty the current directory is searched. - -INPUT = @top_srcdir@/src @top_srcdir@/test @builddir@/elpa - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses -# libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: http://www.gnu.org/software/libiconv) for the list of -# possible encodings. -# The default value is: UTF-8. - -INPUT_ENCODING = UTF-8 - -# If the value of the INPUT tag contains directories, you can use the -# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and -# *.h) to filter out the source-files in the directories. 
-# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# read by doxygen. -# -# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, -# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, -# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, -# *.vhdl, *.ucf, *.qsf, *.as and *.js. - -FILE_PATTERNS = - -# The RECURSIVE tag can be used to specify whether or not subdirectories should -# be searched for input files as well. -# The default value is: NO. - -RECURSIVE = YES - -# The EXCLUDE tag can be used to specify files and/or directories that should be -# excluded from the INPUT source files. This way you can easily exclude a -# subdirectory from a directory tree whose root is specified with the INPUT tag. -# -# Note that relative paths are relative to the directory from which doxygen is -# run. 
- -EXCLUDE = @top_srcdir@/src/elpa1_compute.F90 \ - @top_srcdir@/src/elpa2_compute.F90 \ - @top_srcdir@/src/elpa2_utilities.F90 \ - @top_srcdir@/src/elpa_c_interface.F90 \ - @top_srcdir@/src/elpa_reduce_add_vectors.X90 \ - @top_srcdir@/src/elpa_transpose_vectors.X90 \ - @top_srcdir@/src/elpa_utilities.F90 \ - @top_srcdir@/src/timer.F90 \ - @top_srcdir@/src/redist_band.X90 \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_complex.f90 \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real.f90 \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_simple.f90 \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c \ - @top_srcdir@/src/elpa_qr/elpa_pdgeqrf.f90 \ - @top_srcdir@/src/elpa_qr/elpa_pdlarfb.f90 \ - @top_srcdir@/src/elpa_qr/elpa_qrkernels.f90 \ - @top_srcdir@/src/elpa_qr/qr_utils.f90 \ - @top_srcdir@/src/ftimings/ftimings.F90 \ - @top_srcdir@/src/ftimings/ftimings_type.F90 \ - @top_srcdir@/src/ftimings/ftimings_value.F90 \ - @top_srcdir@/src/ftimings/highwater_mark.c \ - @top_srcdir@/src/ftimings/papi.c \ - @top_srcdir@/src/ftimings/resident_set_size.c \ - @top_srcdir@/src/ftimings/time.c \ - @top_srcdir@/src/ftimings/virtual_memory.c \ - @top_srcdir@/test/c_test_programs/elpa1_test_complex_c_version.c \ - @top_srcdir@/test/c_test_programs/elpa1_test_real_c_version.c \ - @top_srcdir@/test/c_test_programs/elpa2_test_complex_c_version.c \ - @top_srcdir@/test/c_test_programs/elpa2_test_real_c_version.c \ - 
@top_srcdir@/test/fortran_test_programs/read_real.F90 \ - @top_srcdir@/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 \ - @top_srcdir@/test/fortran_test_programs/test_complex2_default_kernel.F90 \ - @top_srcdir@/test/fortran_test_programs/test_complex2.F90 \ - @top_srcdir@/test/fortran_test_programs/test_complex.F90 \ - @top_srcdir@/test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 \ - @top_srcdir@/test/fortran_test_programs/test_real2_default_kernel.F90 \ - @top_srcdir@/test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 \ - @top_srcdir@/test/fortran_test_programs/test_real2.F90 \ - @top_srcdir@/test/fortran_test_programs/test_real.F90 \ - @top_srcdir@/test/fortran_test_programs/test_real_with_c.F90 \ - @top_srcdir@/test/shared_sources/blacs_infrastructure.F90 \ - @top_srcdir@/test/shared_sources/call_elpa1.c \ - @top_srcdir@/test/shared_sources/call_elpa2.c \ - @top_srcdir@/test/shared_sources/check_correctnes.F90 \ - @top_srcdir@/test/shared_sources/mod_from_c.F90 \ - @top_srcdir@/test/shared_sources/prepare_matrix.F90 \ - @top_srcdir@/test/shared_sources/read_input_parameters.F90 \ - @top_srcdir@/test/shared_sources/redir.c \ - @top_srcdir@/test/shared_sources/redirect.F90 \ - @top_srcdir@/test/shared_sources/setup_mpi.F90 \ - @top_srcdir@/test/shared_sources/util.F90 - -# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or -# directories that are symbolic links (a Unix file system feature) are excluded -# from the input. -# The default value is: NO. - -EXCLUDE_SYMLINKS = NO - -# If the value of the INPUT tag contains directories, you can use the -# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude -# certain files from those directories. 
-# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories for example use the pattern */test/* - -EXCLUDE_PATTERNS = - -# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names -# (namespaces, classes, functions, etc.) that should be excluded from the -# output. The symbol name can be a fully qualified name, a word, or if the -# wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* - -EXCLUDE_SYMBOLS = - -# The EXAMPLE_PATH tag can be used to specify one or more files or directories -# that contain example code fragments that are included (see the \include -# command). - -EXAMPLE_PATH = - -# If the value of the EXAMPLE_PATH tag contains directories, you can use the -# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and -# *.h) to filter out the source-files in the directories. If left blank all -# files are included. - -EXAMPLE_PATTERNS = - -# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be -# searched for input files to be used with the \include or \dontinclude commands -# irrespective of the value of the RECURSIVE tag. -# The default value is: NO. - -EXAMPLE_RECURSIVE = NO - -# The IMAGE_PATH tag can be used to specify one or more files or directories -# that contain images that are to be included in the documentation (see the -# \image command). - -IMAGE_PATH = - -# The INPUT_FILTER tag can be used to specify a program that doxygen should -# invoke to filter for each input file. Doxygen will invoke the filter program -# by executing (via popen()) the command: -# -# <filter> <input-file> -# -# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the -# name of an input file. 
Doxygen will then use the output that the filter -# program writes to standard output. If FILTER_PATTERNS is specified, this tag -# will be ignored. -# -# Note that the filter must not add or remove lines; it is applied before the -# code is scanned, but not when the output code is generated. If lines are added -# or removed, the anchors will not be placed correctly. - -INPUT_FILTER = - -# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern -# basis. Doxygen will compare the file name with each pattern and apply the -# filter if there is a match. The filters are a list of the form: pattern=filter -# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how -# filters are used. If the FILTER_PATTERNS tag is empty or if none of the -# patterns match the file name, INPUT_FILTER is applied. - -FILTER_PATTERNS = - -# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using -# INPUT_FILTER) will also be used to filter the input files that are used for -# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). -# The default value is: NO. - -FILTER_SOURCE_FILES = NO - -# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file -# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and -# it is also possible to disable source filtering for a specific pattern using -# *.ext= (so without naming a filter). -# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. - -FILTER_SOURCE_PATTERNS = - -# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that -# is part of the input, its contents will be placed on the main page -# (index.html). This can be useful if you have a project on for instance GitHub -# and want to reuse the introduction page also for the doxygen output. 
- -USE_MDFILE_AS_MAINPAGE = - -#--------------------------------------------------------------------------- -# Configuration options related to source browsing -#--------------------------------------------------------------------------- - -# If the SOURCE_BROWSER tag is set to YES then a list of source files will be -# generated. Documented entities will be cross-referenced with these sources. -# -# Note: To get rid of all source code in the generated output, make sure that -# also VERBATIM_HEADERS is set to NO. -# The default value is: NO. - -SOURCE_BROWSER = NO - -# Setting the INLINE_SOURCES tag to YES will include the body of functions, -# classes and enums directly into the documentation. -# The default value is: NO. - -INLINE_SOURCES = NO - -# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any -# special comment blocks from generated source code fragments. Normal C, C++ and -# Fortran comments will always remain visible. -# The default value is: YES. - -STRIP_CODE_COMMENTS = YES - -# If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. -# The default value is: NO. - -REFERENCED_BY_RELATION = NO - -# If the REFERENCES_RELATION tag is set to YES then for each documented function -# all documented entities called/used by that function will be listed. -# The default value is: NO. - -REFERENCES_RELATION = NO - -# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set -# to YES then the hyperlinks from functions in REFERENCES_RELATION and -# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will -# link to the documentation. -# The default value is: YES. 
- -REFERENCES_LINK_SOURCE = YES - -# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the -# source code will show a tooltip with additional information such as prototype, -# brief description and links to the definition and documentation. Since this -# will make the HTML file larger and loading of large files a bit slower, you -# can opt to disable this feature. -# The default value is: YES. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -SOURCE_TOOLTIPS = YES - -# If the USE_HTAGS tag is set to YES then the references to source code will -# point to the HTML generated by the htags(1) tool instead of doxygen built-in -# source browser. The htags tool is part of GNU's global source tagging system -# (see http://www.gnu.org/software/global/global.html). You will need version -# 4.8.6 or higher. -# -# To use it do the following: -# - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file -# - Make sure the INPUT points to the root of the source tree -# - Run doxygen as normal -# -# Doxygen will invoke htags (and that will in turn invoke gtags), so these -# tools must be available from the command line (i.e. in the search path). -# -# The result: instead of the source browser generated by doxygen, the links to -# source code will now point to the output of htags. -# The default value is: NO. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -USE_HTAGS = NO - -# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a -# verbatim copy of the header file for each class for which an include is -# specified. Set to NO to disable this. -# See also: Section \class. -# The default value is: YES. 
- -VERBATIM_HEADERS = YES - -#--------------------------------------------------------------------------- -# Configuration options related to the alphabetical class index -#--------------------------------------------------------------------------- - -# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all -# compounds will be generated. Enable this if the project contains a lot of -# classes, structs, unions or interfaces. -# The default value is: YES. - -ALPHABETICAL_INDEX = YES - -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -# which the alphabetical index list will be split. -# Minimum value: 1, maximum value: 20, default value: 5. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -IGNORE_PREFIX = - -#--------------------------------------------------------------------------- -# Configuration options related to the HTML output -#--------------------------------------------------------------------------- - -# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output -# The default value is: YES. - -GENERATE_HTML = YES - -# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a -# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of -# it. -# The default directory is: html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_OUTPUT = html - -# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each -# generated HTML page (for example: .htm, .php, .asp). -# The default value is: .html. 
-# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FILE_EXTENSION = .html - -# The HTML_HEADER tag can be used to specify a user-defined HTML header file for -# each generated HTML page. If the tag is left blank doxygen will generate a -# standard header. -# -# To get valid HTML the header file that includes any scripts and style sheets -# that doxygen needs, which is dependent on the configuration options used (e.g. -# the setting GENERATE_TREEVIEW). It is highly recommended to start with a -# default header using -# doxygen -w html new_header.html new_footer.html new_stylesheet.css -# YourConfigFile -# and then modify the file new_header.html. See also section "Doxygen usage" -# for information on how to generate the default header that doxygen normally -# uses. -# Note: The header is subject to change so you typically have to regenerate the -# default header when upgrading to a newer version of doxygen. For a description -# of the possible markers and block names see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_HEADER = - -# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each -# generated HTML page. If the tag is left blank doxygen will generate a standard -# footer. See HTML_HEADER for more information on how to generate a default -# footer and what special commands can be used inside the footer. See also -# section "Doxygen usage" for information on how to generate the default footer -# that doxygen normally uses. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FOOTER = - -# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style -# sheet that is used by each HTML page. It can be used to fine-tune the look of -# the HTML output. If left blank doxygen will generate a default style sheet. -# See also section "Doxygen usage" for information on how to generate the style -# sheet that doxygen normally uses. 
-# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as -# it is more robust and this tag (HTML_STYLESHEET) will in the future become -# obsolete. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_STYLESHEET = - -# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined -# cascading style sheets that are included after the standard style sheets -# created by doxygen. Using this option one can overrule certain style aspects. -# This is preferred over using HTML_STYLESHEET since it does not replace the -# standard style sheet and is therefore more robust against future updates. -# Doxygen will copy the style sheet files to the output directory. -# Note: The order of the extra style sheet files is of importance (e.g. the last -# style sheet in the list overrules the setting of the previous ones in the -# list). For an example see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_STYLESHEET = - -# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or -# other source files which should be copied to the HTML output directory. Note -# that these files will be copied to the base HTML output directory. Use the -# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these -# files. In the HTML_STYLESHEET file, use the file name only. Also note that the -# files will be copied as-is; there are no commands or markers available. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_FILES = - -# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen -# will adjust the colors in the style sheet and background images according to -# this color. Hue is specified as an angle on a colorwheel, see -# http://en.wikipedia.org/wiki/Hue for more information. For instance the value -# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 -# purple, and 360 is red again. 
-# Minimum value: 0, maximum value: 359, default value: 220. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_HUE = 220 - -# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors -# in the HTML output. For a value of 0 the output will use grayscales only. A -# value of 255 will produce the most vivid colors. -# Minimum value: 0, maximum value: 255, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_SAT = 100 - -# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the -# luminance component of the colors in the HTML output. Values below 100 -# gradually make the output lighter, whereas values above 100 make the output -# darker. The value divided by 100 is the actual gamma applied, so 80 represents -# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not -# change the gamma. -# Minimum value: 40, maximum value: 240, default value: 80. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_GAMMA = 80 - -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - -# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML -# documentation will contain sections that can be hidden and shown after the -# page has loaded. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_DYNAMIC_SECTIONS = NO - -# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries -# shown in the various tree structured indices initially; the user can expand -# and collapse entries dynamically later on. 
Doxygen will expand the tree to -# such a level that at most the specified number of entries are visible (unless -# a fully collapsed tree already exceeds this amount). So setting the number of -# entries 1 will produce a full collapsed tree by default. 0 is a special value -# representing an infinite number of entries and will result in a full expanded -# tree by default. -# Minimum value: 0, maximum value: 9999, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_INDEX_NUM_ENTRIES = 100 - -# If the GENERATE_DOCSET tag is set to YES, additional index files will be -# generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: http://developer.apple.com/tools/xcode/), introduced with -# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory. Running make will produce the docset in -# that directory and running make install will install the docset in -# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_DOCSET = NO - -# This tag determines the name of the docset feed. A documentation feed provides -# an umbrella under which multiple documentation sets from a single provider -# (such as a company or product suite) can be grouped. -# The default value is: Doxygen generated docs. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_FEEDNAME = "Doxygen generated docs" - -# This tag specifies a string that should uniquely identify the documentation -# set bundle. This should be a reverse domain-name style string, e.g. -# com.mycompany.MyDocSet. Doxygen will append .docset to the name. -# The default value is: org.doxygen.Project. 
-# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_BUNDLE_ID = org.doxygen.Project - -# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify -# the documentation publisher. This should be a reverse domain-name style -# string, e.g. com.mycompany.MyDocSet.documentation. -# The default value is: org.doxygen.Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_ID = org.doxygen.Publisher - -# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. -# The default value is: Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_NAME = Publisher - -# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three -# additional HTML index files: index.hhp, index.hhc, and index.hhk. The -# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. -# -# The HTML Help Workshop contains a compiler that can convert all HTML output -# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML -# files are now used as the Windows 98 help format, and will replace the old -# Windows help format (.hlp) on all Windows platforms in the future. Compressed -# HTML files also contain an index, a table of contents, and you can search for -# words in the documentation. The HTML workshop also contains a viewer for -# compressed HTML files. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_HTMLHELP = NO - -# The CHM_FILE tag can be used to specify the file name of the resulting .chm -# file. You can add a path in front of the file if the result should not be -# written to the html output directory. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
- -CHM_FILE = - -# The HHC_LOCATION tag can be used to specify the location (absolute path -# including file name) of the HTML help compiler (hhc.exe). If non-empty, -# doxygen will try to run the HTML help compiler on the generated index.hhp. -# The file has to be specified with full path. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -HHC_LOCATION = - -# The GENERATE_CHI flag controls if a separate .chi index file is generated -# (YES) or that it should be included in the master .chm file (NO). -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -GENERATE_CHI = NO - -# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) -# and project file content. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -CHM_INDEX_ENCODING = - -# The BINARY_TOC flag controls whether a binary table of contents is generated -# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it -# enables the Previous and Next buttons. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -BINARY_TOC = NO - -# The TOC_EXPAND flag can be set to YES to add extra items for group members to -# the table of contents of the HTML help documentation and to the tree view. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -TOC_EXPAND = NO - -# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and -# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that -# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help -# (.qch) of the generated HTML documentation. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_QHP = NO - -# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify -# the file name of the resulting .qch file. 
The path specified is relative to -# the HTML output folder. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QCH_FILE = - -# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help -# Project output. For more information please see Qt Help Project / Namespace -# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_NAMESPACE = org.doxygen.Project - -# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt -# Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- -# folders). -# The default value is: doc. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_VIRTUAL_FOLDER = doc - -# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom -# filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_NAME = - -# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the -# custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_ATTRS = - -# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this -# project's filter section matches. Qt Help Project / Filter Attributes (see: -# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_SECT_FILTER_ATTRS = - -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. 
If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHG_LOCATION = - -# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be -# generated, together with the HTML files, they form an Eclipse help plugin. To -# install this plugin and make it available under the help contents menu in -# Eclipse, the contents of the directory containing the HTML and XML files needs -# to be copied into the plugins directory of eclipse. The name of the directory -# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. -# After copying Eclipse needs to be restarted before the help appears. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_ECLIPSEHELP = NO - -# A unique identifier for the Eclipse help plugin. When installing the plugin -# the directory name containing the HTML and XML files should also have this -# name. Each documentation set should have its own identifier. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. - -ECLIPSE_DOC_ID = org.doxygen.Project - -# If you want full control over the layout of the generated HTML pages it might -# be necessary to disable the index and replace it with your own. The -# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top -# of each HTML page. A value of NO enables the index and the value YES disables -# it. Since the tabs in the index contain the same information as the navigation -# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -DISABLE_INDEX = NO - -# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index -# structure should be generated to display hierarchical information. 
If the tag -# value is set to YES, a side panel will be generated containing a tree-like -# index structure (just like the one that is generated for HTML Help). For this -# to work a browser that supports JavaScript, DHTML, CSS and frames is required -# (i.e. any modern browser). Windows users are probably better off using the -# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can -# further fine-tune the look of the index. As an example, the default style -# sheet generated by doxygen has an example that shows how to put an image at -# the root of the tree instead of the PROJECT_NAME. Since the tree basically has -# the same information as the tab index, you could consider setting -# DISABLE_INDEX to YES when enabling this option. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_TREEVIEW = NO - -# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that -# doxygen will group on one line in the generated HTML documentation. -# -# Note that a value of 0 will completely suppress the enum values from appearing -# in the overview section. -# Minimum value: 0, maximum value: 20, default value: 4. -# This tag requires that the tag GENERATE_HTML is set to YES. - -ENUM_VALUES_PER_LINE = 4 - -# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used -# to set the initial width (in pixels) of the frame in which the tree is shown. -# Minimum value: 0, maximum value: 1500, default value: 250. -# This tag requires that the tag GENERATE_HTML is set to YES. - -TREEVIEW_WIDTH = 250 - -# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to -# external symbols imported via tag files in a separate window. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -EXT_LINKS_IN_WINDOW = NO - -# Use this tag to change the font size of LaTeX formulas included as images in -# the HTML documentation. 
When you change the font size after a successful -# doxygen run you need to manually remove any form_*.png images from the HTML -# output directory to force them to be regenerated. -# Minimum value: 8, maximum value: 50, default value: 10. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_FONTSIZE = 10 - -# Use the FORMULA_TRANPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are not -# supported properly for IE 6.0, but are supported on all modern browsers. -# -# Note that when changing this option you need to delete any form_*.png files in -# the HTML output directory before the changes have effect. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_TRANSPARENT = YES - -# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# http://www.mathjax.org) which uses client side Javascript for the rendering -# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX -# installed or if you want to formulas look prettier in the HTML output. When -# enabled you may also need to install MathJax separately and configure the path -# to it using the MATHJAX_RELPATH option. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -USE_MATHJAX = NO - -# When MathJax is enabled you can set the default output format to be used for -# the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. -# Possible values are: HTML-CSS (which is slower, but has the best -# compatibility), NativeMML (i.e. MathML) and SVG. -# The default value is: HTML-CSS. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_FORMAT = HTML-CSS - -# When MathJax is enabled you need to specify the location relative to the HTML -# output directory using the MATHJAX_RELPATH option. 
The destination directory -# should contain the MathJax.js script. For instance, if the mathjax directory -# is located at the same level as the HTML output directory, then -# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax -# Content Delivery Network so you can quickly see the result without installing -# MathJax. However, it is strongly recommended to install a local copy of -# MathJax from http://www.mathjax.org before deployment. -# The default value is: http://cdn.mathjax.org/mathjax/latest. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest - -# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax -# extension names that should be enabled during MathJax rendering. For example -# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_EXTENSIONS = - -# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces -# of code that will be used on startup of the MathJax code. See the MathJax site -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an -# example see the documentation. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_CODEFILE = - -# When the SEARCHENGINE tag is enabled doxygen will generate a search box for -# the HTML output. The underlying search engine uses javascript and DHTML and -# should work on any modern browser. Note that when using HTML help -# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) -# there is already a search function so this one should typically be disabled. -# For large projects the javascript based search engine can be slow, then -# enabling SERVER_BASED_SEARCH may provide a better solution. 
It is possible to -# search using the keyboard; to jump to the search box use <access key> + S -# (what the <access key> is depends on the OS and browser, but it is typically -# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down -# key> to jump into the search results window, the results can be navigated -# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel -# the search. The filter options can be selected when the cursor is inside the -# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys> -# to select a filter and <Enter> or <escape> to activate or cancel the filter -# option. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES. - -SEARCHENGINE = YES - -# When the SERVER_BASED_SEARCH tag is enabled the search engine will be -# implemented using a web server instead of a web client using Javascript. There -# are two flavors of web server based searching depending on the EXTERNAL_SEARCH -# setting. When disabled, doxygen will generate a PHP script for searching and -# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing -# and searching needs to be provided by external tools. See the section -# "External Indexing and Searching" for details. -# The default value is: NO. -# This tag requires that the tag SEARCHENGINE is set to YES. - -SERVER_BASED_SEARCH = NO - -# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP -# script for searching. Instead the search results are written to an XML file -# which needs to be processed by an external indexer. Doxygen will invoke an -# external search engine pointed to by the SEARCHENGINE_URL option to obtain the -# search results. -# -# Doxygen ships with an example indexer (doxyindexer) and search engine -# (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). 
-# -# See the section "External Indexing and Searching" for details. -# The default value is: NO. -# This tag requires that the tag SEARCHENGINE is set to YES. - -EXTERNAL_SEARCH = NO - -# The SEARCHENGINE_URL should point to a search engine hosted by a web server -# which will return the search results when EXTERNAL_SEARCH is enabled. -# -# Doxygen ships with an example indexer (doxyindexer) and search engine -# (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). See the section "External Indexing and -# Searching" for details. -# This tag requires that the tag SEARCHENGINE is set to YES. - -SEARCHENGINE_URL = - -# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed -# search data is written to a file for indexing by an external tool. With the -# SEARCHDATA_FILE tag the name of this file can be specified. -# The default file is: searchdata.xml. -# This tag requires that the tag SEARCHENGINE is set to YES. - -SEARCHDATA_FILE = searchdata.xml - -# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the -# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is -# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple -# projects and redirect the results back to the right project. -# This tag requires that the tag SEARCHENGINE is set to YES. - -EXTERNAL_SEARCH_ID = - -# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen -# projects other than the one defined by this configuration file, but that are -# all added to the same external search index. Each project needs to have a -# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of -# to a relative location where the documentation can be found. The format is: -# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ... -# This tag requires that the tag SEARCHENGINE is set to YES. 
- -EXTRA_SEARCH_MAPPINGS = - -#--------------------------------------------------------------------------- -# Configuration options related to the LaTeX output -#--------------------------------------------------------------------------- - -# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output. -# The default value is: YES. - -GENERATE_LATEX = YES - -# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a -# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of -# it. -# The default directory is: latex. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_OUTPUT = latex - -# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be -# invoked. -# -# Note that when enabling USE_PDFLATEX this option is only used for generating -# bitmaps for formulas in the HTML output, but not in the Makefile that is -# written to the output directory. -# The default file is: latex. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_CMD_NAME = latex - -# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate -# index for LaTeX. -# The default file is: makeindex. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -MAKEINDEX_CMD_NAME = makeindex - -# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX -# documents. This may be useful for small projects and may help to save some -# trees in general. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -COMPACT_LATEX = NO - -# The PAPER_TYPE tag can be used to set the paper type that is used by the -# printer. -# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x -# 14 inches) and executive (7.25 x 10.5 inches). -# The default value is: a4. -# This tag requires that the tag GENERATE_LATEX is set to YES. 
- -PAPER_TYPE = a4 - -# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names -# that should be included in the LaTeX output. The package can be specified just -# by its name or with the correct syntax as to be used with the LaTeX -# \usepackage command. To get the times font for instance you can specify : -# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times} -# To use the option intlimits with the amsmath package you can specify: -# EXTRA_PACKAGES=[intlimits]{amsmath} -# If left blank no extra packages will be included. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -EXTRA_PACKAGES = - -# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the -# generated LaTeX document. The header should contain everything until the first -# chapter. If it is left blank doxygen will generate a standard header. See -# section "Doxygen usage" for information on how to let doxygen write the -# default header to a separate file. -# -# Note: Only use a user-defined header if you know what you are doing! The -# following commands have a special meaning inside the header: $title, -# $datetime, $date, $doxygenversion, $projectname, $projectnumber, -# $projectbrief, $projectlogo. Doxygen will replace $title with the empty -# string, for the replacement values of the other commands the user is referred -# to HTML_HEADER. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_HEADER = - -# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the -# generated LaTeX document. The footer should contain everything after the last -# chapter. If it is left blank doxygen will generate a standard footer. See -# LATEX_HEADER for more information on how to generate a default footer and what -# special commands can be used inside the footer. -# -# Note: Only use a user-defined footer if you know what you are doing! -# This tag requires that the tag GENERATE_LATEX is set to YES. 
- -LATEX_FOOTER = - -# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined -# LaTeX style sheets that are included after the standard style sheets created -# by doxygen. Using this option one can overrule certain style aspects. Doxygen -# will copy the style sheet files to the output directory. -# Note: The order of the extra style sheet files is of importance (e.g. the last -# style sheet in the list overrules the setting of the previous ones in the -# list). -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_EXTRA_STYLESHEET = - -# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or -# other source files which should be copied to the LATEX_OUTPUT output -# directory. Note that the files will be copied as-is; there are no commands or -# markers available. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_EXTRA_FILES = - -# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is -# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will -# contain links (just like the HTML output) instead of page references. This -# makes the output suitable for online browsing using a PDF viewer. -# The default value is: YES. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -PDF_HYPERLINKS = YES - -# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate -# the PDF file directly from the LaTeX files. Set this option to YES, to get a -# higher quality PDF documentation. -# The default value is: YES. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -USE_PDFLATEX = YES - -# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode -# command to the generated LaTeX files. This will instruct LaTeX to keep running -# if errors occur, instead of asking the user for help. This option is also used -# when generating formulas in HTML. -# The default value is: NO. 
-# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_BATCHMODE = NO - -# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the -# index chapters (such as File Index, Compound Index, etc.) in the output. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_HIDE_INDICES = NO - -# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source -# code with syntax highlighting in the LaTeX output. -# -# Note that which sources are shown also depends on other settings such as -# SOURCE_BROWSER. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_SOURCE_CODE = NO - -# The LATEX_BIB_STYLE tag can be used to specify the style to use for the -# bibliography, e.g. plainnat, or ieeetr. See -# http://en.wikipedia.org/wiki/BibTeX and \cite for more info. -# The default value is: plain. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_BIB_STYLE = plain - -#--------------------------------------------------------------------------- -# Configuration options related to the RTF output -#--------------------------------------------------------------------------- - -# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The -# RTF output is optimized for Word 97 and may not look too pretty with other RTF -# readers/editors. -# The default value is: NO. - -GENERATE_RTF = NO - -# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a -# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of -# it. -# The default directory is: rtf. -# This tag requires that the tag GENERATE_RTF is set to YES. - -RTF_OUTPUT = rtf - -# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF -# documents. This may be useful for small projects and may help to save some -# trees in general. -# The default value is: NO. 
-# This tag requires that the tag GENERATE_RTF is set to YES. - -COMPACT_RTF = NO - -# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will -# contain hyperlink fields. The RTF file will contain links (just like the HTML -# output) instead of page references. This makes the output suitable for online -# browsing using Word or some other Word compatible readers that support those -# fields. -# -# Note: WordPad (write) and others do not support links. -# The default value is: NO. -# This tag requires that the tag GENERATE_RTF is set to YES. - -RTF_HYPERLINKS = NO - -# Load stylesheet definitions from file. Syntax is similar to doxygen's config -# file, i.e. a series of assignments. You only have to provide replacements, -# missing definitions are set to their default value. -# -# See also section "Doxygen usage" for information on how to generate the -# default style sheet that doxygen normally uses. -# This tag requires that the tag GENERATE_RTF is set to YES. - -RTF_STYLESHEET_FILE = - -# Set optional variables used in the generation of an RTF document. Syntax is -# similar to doxygen's config file. A template extensions file can be generated -# using doxygen -e rtf extensionFile. -# This tag requires that the tag GENERATE_RTF is set to YES. - -RTF_EXTENSIONS_FILE = - -# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code -# with syntax highlighting in the RTF output. -# -# Note that which sources are shown also depends on other settings such as -# SOURCE_BROWSER. -# The default value is: NO. -# This tag requires that the tag GENERATE_RTF is set to YES. - -RTF_SOURCE_CODE = NO - -#--------------------------------------------------------------------------- -# Configuration options related to the man page output -#--------------------------------------------------------------------------- - -# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for -# classes and files. -# The default value is: NO. 
- -GENERATE_MAN = YES - -# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a -# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of -# it. A directory man3 will be created inside the directory specified by -# MAN_OUTPUT. -# The default directory is: man. -# This tag requires that the tag GENERATE_MAN is set to YES. - -MAN_OUTPUT = man - -# The MAN_EXTENSION tag determines the extension that is added to the generated -# man pages. In case the manual section does not start with a number, the number -# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is -# optional. -# The default value is: .3. -# This tag requires that the tag GENERATE_MAN is set to YES. - -MAN_EXTENSION = .3 - -# The MAN_SUBDIR tag determines the name of the directory created within -# MAN_OUTPUT in which the man pages are placed. If defaults to man followed by -# MAN_EXTENSION with the initial . removed. -# This tag requires that the tag GENERATE_MAN is set to YES. - -MAN_SUBDIR = - -# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it -# will generate one additional man file for each entity documented in the real -# man page(s). These additional files only source the real man page, but without -# them the man command would be unable to find the correct page. -# The default value is: NO. -# This tag requires that the tag GENERATE_MAN is set to YES. - -MAN_LINKS = NO - -#--------------------------------------------------------------------------- -# Configuration options related to the XML output -#--------------------------------------------------------------------------- - -# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that -# captures the structure of the code including all documentation. -# The default value is: NO. - -GENERATE_XML = NO - -# The XML_OUTPUT tag is used to specify where the XML pages will be put. 
If a -# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of -# it. -# The default directory is: xml. -# This tag requires that the tag GENERATE_XML is set to YES. - -XML_OUTPUT = xml - -# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program -# listings (including syntax highlighting and cross-referencing information) to -# the XML output. Note that enabling this will significantly increase the size -# of the XML output. -# The default value is: YES. -# This tag requires that the tag GENERATE_XML is set to YES. - -XML_PROGRAMLISTING = YES - -#--------------------------------------------------------------------------- -# Configuration options related to the DOCBOOK output -#--------------------------------------------------------------------------- - -# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files -# that can be used to generate PDF. -# The default value is: NO. - -GENERATE_DOCBOOK = NO - -# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in -# front of it. -# The default directory is: docbook. -# This tag requires that the tag GENERATE_DOCBOOK is set to YES. - -DOCBOOK_OUTPUT = docbook - -# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the -# program listings (including syntax highlighting and cross-referencing -# information) to the DOCBOOK output. Note that enabling this will significantly -# increase the size of the DOCBOOK output. -# The default value is: NO. -# This tag requires that the tag GENERATE_DOCBOOK is set to YES. 
- -DOCBOOK_PROGRAMLISTING = NO - -#--------------------------------------------------------------------------- -# Configuration options for the AutoGen Definitions output -#--------------------------------------------------------------------------- - -# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sf.net) file that captures the -# structure of the code including all documentation. Note that this feature is -# still experimental and incomplete at the moment. -# The default value is: NO. - -GENERATE_AUTOGEN_DEF = NO - -#--------------------------------------------------------------------------- -# Configuration options related to the Perl module output -#--------------------------------------------------------------------------- - -# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module -# file that captures the structure of the code including all documentation. -# -# Note that this feature is still experimental and incomplete at the moment. -# The default value is: NO. - -GENERATE_PERLMOD = NO - -# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary -# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI -# output from the Perl module output. -# The default value is: NO. -# This tag requires that the tag GENERATE_PERLMOD is set to YES. - -PERLMOD_LATEX = NO - -# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely -# formatted so it can be parsed by a human reader. This is useful if you want to -# understand what is going on. On the other hand, if this tag is set to NO, the -# size of the Perl module output will be much smaller and Perl will parse it -# just the same. -# The default value is: YES. -# This tag requires that the tag GENERATE_PERLMOD is set to YES. 
- -PERLMOD_PRETTY = YES - -# The names of the make variables in the generated doxyrules.make file are -# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful -# so different doxyrules.make files included by the same Makefile don't -# overwrite each other's variables. -# This tag requires that the tag GENERATE_PERLMOD is set to YES. - -PERLMOD_MAKEVAR_PREFIX = - -#--------------------------------------------------------------------------- -# Configuration options related to the preprocessor -#--------------------------------------------------------------------------- - -# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all -# C-preprocessor directives found in the sources and include files. -# The default value is: YES. - -ENABLE_PREPROCESSING = YES - -# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names -# in the source code. If set to NO, only conditional compilation will be -# performed. Macro expansion can be done in a controlled way by setting -# EXPAND_ONLY_PREDEF to YES. -# The default value is: NO. -# This tag requires that the tag ENABLE_PREPROCESSING is set to YES. - -MACRO_EXPANSION = NO - -# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then -# the macro expansion is limited to the macros specified with the PREDEFINED and -# EXPAND_AS_DEFINED tags. -# The default value is: NO. -# This tag requires that the tag ENABLE_PREPROCESSING is set to YES. - -EXPAND_ONLY_PREDEF = NO - -# If the SEARCH_INCLUDES tag is set to YES, the include files in the -# INCLUDE_PATH will be searched if a #include is found. -# The default value is: YES. -# This tag requires that the tag ENABLE_PREPROCESSING is set to YES. - -SEARCH_INCLUDES = YES - -# The INCLUDE_PATH tag can be used to specify one or more directories that -# contain include files that are not input files but should be processed by the -# preprocessor. -# This tag requires that the tag SEARCH_INCLUDES is set to YES. 
- -INCLUDE_PATH = - -# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard -# patterns (like *.h and *.hpp) to filter out the header-files in the -# directories. If left blank, the patterns specified with FILE_PATTERNS will be -# used. -# This tag requires that the tag ENABLE_PREPROCESSING is set to YES. - -INCLUDE_FILE_PATTERNS = - -# The PREDEFINED tag can be used to specify one or more macro names that are -# defined before the preprocessor is started (similar to the -D option of e.g. -# gcc). The argument of the tag is a list of macros of the form: name or -# name=definition (no spaces). If the definition and the "=" are omitted, "=1" -# is assumed. To prevent a macro definition from being undefined via #undef or -# recursively expanded use the := operator instead of the = operator. -# This tag requires that the tag ENABLE_PREPROCESSING is set to YES. - -PREDEFINED = - -# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this -# tag can be used to specify a list of macro names that should be expanded. The -# macro definition that is found in the sources will be used. Use the PREDEFINED -# tag if you want to use a different macro definition that overrules the -# definition found in the source code. -# This tag requires that the tag ENABLE_PREPROCESSING is set to YES. - -EXPAND_AS_DEFINED = - -# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will -# remove all references to function-like macros that are alone on a line, have -# an all uppercase name, and do not end with a semicolon. Such function macros -# are typically used for boiler-plate code, and will confuse the parser if not -# removed. -# The default value is: YES. -# This tag requires that the tag ENABLE_PREPROCESSING is set to YES. 
- -SKIP_FUNCTION_MACROS = YES - -#--------------------------------------------------------------------------- -# Configuration options related to external references -#--------------------------------------------------------------------------- - -# The TAGFILES tag can be used to specify one or more tag files. For each tag -# file the location of the external documentation should be added. The format of -# a tag file without this location is as follows: -# TAGFILES = file1 file2 ... -# Adding location for the tag files is done as follows: -# TAGFILES = file1=loc1 "file2 = loc2" ... -# where loc1 and loc2 can be relative or absolute paths or URLs. See the -# section "Linking to external documentation" for more information about the use -# of tag files. -# Note: Each tag file must have a unique name (where the name does NOT include -# the path). If a tag file is not located in the directory in which doxygen is -# run, you must also specify the path to the tagfile here. - -TAGFILES = - -# When a file name is specified after GENERATE_TAGFILE, doxygen will create a -# tag file that is based on the input files it reads. See section "Linking to -# external documentation" for more information about the usage of tag files. - -GENERATE_TAGFILE = - -# If the ALLEXTERNALS tag is set to YES, all external class will be listed in -# the class index. If set to NO, only the inherited external classes will be -# listed. -# The default value is: NO. - -ALLEXTERNALS = NO - -# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed -# in the modules index. If set to NO, only the current project's groups will be -# listed. -# The default value is: YES. - -EXTERNAL_GROUPS = YES - -# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in -# the related pages index. If set to NO, only the current project's pages will -# be listed. -# The default value is: YES. 
- -EXTERNAL_PAGES = YES - -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of 'which perl'). -# The default file (with absolute path) is: /usr/bin/perl. - -PERL_PATH = /usr/bin/perl - -#--------------------------------------------------------------------------- -# Configuration options related to the dot tool -#--------------------------------------------------------------------------- - -# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram -# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to -# NO turns the diagrams off. Note that this option also works with HAVE_DOT -# disabled, but it is recommended to install and use dot, since it yields more -# powerful graphs. -# The default value is: YES. - -CLASS_DIAGRAMS = YES - -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see: -# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the -# documentation. The MSCGEN_PATH tag allows you to specify the directory where -# the mscgen tool resides. If left empty the tool is assumed to be found in the -# default search path. - -MSCGEN_PATH = - -# You can include diagrams made with dia in doxygen documentation. Doxygen will -# then run dia to produce the diagram and insert it in the documentation. The -# DIA_PATH tag allows you to specify the directory where the dia binary resides. -# If left empty dia is assumed to be found in the default search path. - -DIA_PATH = - -# If set to YES the inheritance and collaboration graphs will hide inheritance -# and usage relations if the target is undocumented or is not a class. -# The default value is: YES. - -HIDE_UNDOC_RELATIONS = YES - -# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is -# available from the path. 
This tool is part of Graphviz (see: -# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent -# Bell Labs. The other options in this section have no effect if this option is -# set to NO -# The default value is: NO. - -HAVE_DOT = NO - -# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed -# to run in parallel. When set to 0 doxygen will base this on the number of -# processors available in the system. You can set it explicitly to a value -# larger than 0 to get control over the balance between CPU load and processing -# speed. -# Minimum value: 0, maximum value: 32, default value: 0. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_NUM_THREADS = 0 - -# When you want a differently looking font in the dot files that doxygen -# generates you can specify the font name using DOT_FONTNAME. You need to make -# sure dot is able to find the font, which can be done by putting it in a -# standard location or by setting the DOTFONTPATH environment variable or by -# setting DOT_FONTPATH to the directory containing the font. -# The default value is: Helvetica. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_FONTNAME = Helvetica - -# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of -# dot graphs. -# Minimum value: 4, maximum value: 24, default value: 10. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_FONTSIZE = 10 - -# By default doxygen will tell dot to use the default font as specified with -# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set -# the path where dot can find it using this tag. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_FONTPATH = - -# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for -# each documented class showing the direct and indirect inheritance relations. -# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO. -# The default value is: YES. 
-# This tag requires that the tag HAVE_DOT is set to YES. - -CLASS_GRAPH = YES - -# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a -# graph for each documented class showing the direct and indirect implementation -# dependencies (inheritance, containment, and class references variables) of the -# class with other documented classes. -# The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. - -COLLABORATION_GRAPH = YES - -# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for -# groups, showing the direct groups dependencies. -# The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. - -GROUP_GRAPHS = YES - -# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and -# collaboration diagrams in a style similar to the OMG's Unified Modeling -# Language. -# The default value is: NO. -# This tag requires that the tag HAVE_DOT is set to YES. - -UML_LOOK = NO - -# If the UML_LOOK tag is enabled, the fields and methods are shown inside the -# class node. If there are many fields or methods and many nodes the graph may -# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the -# number of items for each type to make the size more manageable. Set this to 0 -# for no limit. Note that the threshold may be exceeded by 50% before the limit -# is enforced. So when you set the threshold to 10, up to 15 fields may appear, -# but if the number exceeds 15, the total amount of fields shown is limited to -# 10. -# Minimum value: 0, maximum value: 100, default value: 10. -# This tag requires that the tag HAVE_DOT is set to YES. - -UML_LIMIT_NUM_FIELDS = 10 - -# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and -# collaboration graphs will show the relations between templates and their -# instances. -# The default value is: NO. -# This tag requires that the tag HAVE_DOT is set to YES. 
- -TEMPLATE_RELATIONS = NO - -# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to -# YES then doxygen will generate a graph for each documented file showing the -# direct and indirect include dependencies of the file with other documented -# files. -# The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. - -INCLUDE_GRAPH = YES - -# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are -# set to YES then doxygen will generate a graph for each documented file showing -# the direct and indirect include dependencies of the file with other documented -# files. -# The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. - -INCLUDED_BY_GRAPH = YES - -# If the CALL_GRAPH tag is set to YES then doxygen will generate a call -# dependency graph for every global function or class method. -# -# Note that enabling this option will significantly increase the time of a run. -# So in most cases it will be better to enable call graphs for selected -# functions only using the \callgraph command. Disabling a call graph can be -# accomplished by means of the command \hidecallgraph. -# The default value is: NO. -# This tag requires that the tag HAVE_DOT is set to YES. - -CALL_GRAPH = NO - -# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller -# dependency graph for every global function or class method. -# -# Note that enabling this option will significantly increase the time of a run. -# So in most cases it will be better to enable caller graphs for selected -# functions only using the \callergraph command. Disabling a caller graph can be -# accomplished by means of the command \hidecallergraph. -# The default value is: NO. -# This tag requires that the tag HAVE_DOT is set to YES. - -CALLER_GRAPH = NO - -# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical -# hierarchy of all classes instead of a textual one. 
-# The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. - -GRAPHICAL_HIERARCHY = YES - -# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the -# dependencies a directory has on other directories in a graphical way. The -# dependency relations are determined by the #include relations between the -# files in the directories. -# The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. - -DIRECTORY_GRAPH = YES - -# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images -# generated by dot. For an explanation of the image formats see the section -# output formats in the documentation of the dot tool (Graphviz (see: -# http://www.graphviz.org/)). -# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order -# to make the SVG files visible in IE 9+ (other browsers do not have this -# requirement). -# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo, -# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and -# png:gdiplus:gdiplus. -# The default value is: png. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_IMAGE_FORMAT = png - -# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to -# enable generation of interactive SVG images that allow zooming and panning. -# -# Note that this requires a modern browser other than Internet Explorer. Tested -# and working are Firefox, Chrome, Safari, and Opera. -# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make -# the SVG files visible. Older versions of IE do not have SVG support. -# The default value is: NO. -# This tag requires that the tag HAVE_DOT is set to YES. - -INTERACTIVE_SVG = NO - -# The DOT_PATH tag can be used to specify the path where the dot tool can be -# found. If left blank, it is assumed the dot tool can be found in the path. -# This tag requires that the tag HAVE_DOT is set to YES. 
- -DOT_PATH = - -# The DOTFILE_DIRS tag can be used to specify one or more directories that -# contain dot files that are included in the documentation (see the \dotfile -# command). -# This tag requires that the tag HAVE_DOT is set to YES. - -DOTFILE_DIRS = - -# The MSCFILE_DIRS tag can be used to specify one or more directories that -# contain msc files that are included in the documentation (see the \mscfile -# command). - -MSCFILE_DIRS = - -# The DIAFILE_DIRS tag can be used to specify one or more directories that -# contain dia files that are included in the documentation (see the \diafile -# command). - -DIAFILE_DIRS = - -# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the -# path where java can find the plantuml.jar file. If left blank, it is assumed -# PlantUML is not used or called during a preprocessing step. Doxygen will -# generate a warning when it encounters a \startuml command in this case and -# will not generate output for the diagram. - -PLANTUML_JAR_PATH = - -# When using plantuml, the specified paths are searched for files specified by -# the !include statement in a plantuml block. - -PLANTUML_INCLUDE_PATH = - -# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes -# that will be shown in the graph. If the number of nodes in a graph becomes -# larger than this value, doxygen will truncate the graph, which is visualized -# by representing a node as a red box. Note that doxygen if the number of direct -# children of the root node in a graph is already larger than -# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that -# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. -# Minimum value: 0, maximum value: 10000, default value: 50. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_GRAPH_MAX_NODES = 50 - -# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs -# generated by dot. 
A depth value of 3 means that only nodes reachable from the -# root by following a path via at most 3 edges will be shown. Nodes that lay -# further from the root node will be omitted. Note that setting this option to 1 -# or 2 may greatly reduce the computation time needed for large code bases. Also -# note that the size of a graph can be further restricted by -# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. -# Minimum value: 0, maximum value: 1000, default value: 0. -# This tag requires that the tag HAVE_DOT is set to YES. - -MAX_DOT_GRAPH_DEPTH = 0 - -# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent -# background. This is disabled by default, because dot on Windows does not seem -# to support this out of the box. -# -# Warning: Depending on the platform used, enabling this option may lead to -# badly anti-aliased labels on the edges of a graph (i.e. they become hard to -# read). -# The default value is: NO. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_TRANSPARENT = NO - -# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output -# files in one run (i.e. multiple -o and -T options on the command line). This -# makes dot run faster, but since only newer versions of dot (>1.8.10) support -# this, this feature is disabled by default. -# The default value is: NO. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_MULTI_TARGETS = NO - -# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page -# explaining the meaning of the various boxes and arrows in the dot generated -# graphs. -# The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. - -GENERATE_LEGEND = YES - -# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot -# files that are used to generate the various graphs. -# The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. 
- -DOT_CLEANUP = YES diff --git a/INSTALL b/INSTALL deleted file mode 100644 index 8904b27cc..000000000 --- a/INSTALL +++ /dev/null @@ -1,227 +0,0 @@ -How to install ELPA -=================== - -First of all, if you do not want to build ELPA yourself, and you run Linux, -it is worth having a look at the ELPA webpage http://elpa.rzg.mpg.de -and/or the repositories of your Linux distribution: there exist -pre-build packages for a number of Linux distributions like Fedora, -Debian, and OpenSuse. More, will hopefully follow in the future. - -If you want to build (or have to since no packages are available) ELPA yourself, -please note that ELPA is shipped with a typical "configure" and "make" -autotools procedure. This is the only supported way how to build and install ELPA. - -If you obtained ELPA from the official git repository, you will not find -the needed configure script! Please look at the "INSTALL_FROM_GIT_VERSION" file -for the documentation how to proceed. - - -If --- against our recommendations --- you do not want to install ELPA as -library, or you do not want to use the autotools you will have to find a solution -by yourself. This approach is NOT supported by us in any way and we strongly discourage -this approach. - -If you do this, because you want to include ELPA within your code project (i.e. use -ELPA not as an external library but "assimilate" it in your projects ), -please distribute then _ALL_ files (as obtained from an official release tar-ball) -of ELPA with your code. Note, however, that including the ELPA source files in your -code projects (and not using ELPA as external library) is only allowed under the terms of -the LGPL license if your code is ALSO under LGPL. - - -(A): Installing ELPA as library with configure -=================================================== - -The configure installation is best done in four steps - -1) run configure: - - Check the available options with "configure --help". 
- ELPA is shipped with several different versions of the - elpa2-kernel, each is optimized and tuned for a different - architecture. - -1.1) Choice of MPI compiler (wrappers) - - It is mandatory that the C and C++ parts are compiled with the - GNU C, C++ compilers. Thus, all ELPA test programs which are written - in C must be compiled and linked with the MPI compiler (wrapper) which uses - the GNU C compilers. - - The Fortran parts of ELPA can be compiled with any Fortran compiler of - your choice. It is for example quite common to compile the Fortran part - with the Intel Fortran compiler but the C,C++ parts must be compiled with - the GNU compilers. - - Thus special care has to be taken that the correct MPI compiler (wrappers) - are found by the autotools! The configure script tries to find the correct - wrappers automatically, but sometimes it will fail. - - In these cases it is necessary to set these compilers by hand: - - ../configure FC=fortran_compiler_wrapper_of_your_choice CC=gnu_compiler_wrapper - - will tell autotools which wrappers to take. - - - -1.2) Choice of ELPA2 kernels - - With the release of ELPA (2014.06 or newer) it is _not_ - mandatory any more to define the (real and complex) kernels - at build time. The configure procedure will build all the - kernels which can be used on the build system. The choice of - the kernels is now a run-time option. This is the most - convenient and also recommended way. It is intended to augment - this with an auto-tuning feature. - - Nevertheless, one can still define at build-time _one_ - specific kernel (for the real and the complex case each). - Then, ELPA is configured only with this real (and complex) - kernel, and all run-time checking is disabled. Have a look - at the "configure --help" messages and please refer to the - file "./src/elpa2_kernels/README_elpa2_kernels.txt". - - -1.3) Setting up Blacs/Scalapack - - The configure script tries to auto-detect an installed Blacs/Scalapack - library. 
If this is successful, you do not have to specify anything - in this regard. However, this will fail, if you do not use Netlib - Blacs/Scalapack but vendor specific implementations (e.g. Intel's MKL - library or the implementation of Cray and so forth...). - - In this case, please point to your Blacs/Scalapack installation and the - link-line with the variables "SCALAPACK_LDFLAGS" and "SCALAPACK_FCFLAGS". - "SCALAPACK_LDFLAGS" should contain the correct link-line for your - Blacs/Scalapack installation and "SCALAPACK_FCFLAGS" the include path - and any other flags you need at compile time. - - For example with Intel's MKL 11.2 library one might have to set - - SCALAPACK_LDFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 \ - -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 \ - -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" - and - - SCALAPACK_FCFLAGS="-I$MKLROOT/include/intel64/lp64" - - Note, that the actual MKL linkline depends on the installed MKL version. - - If your libraries are in non-standard locations, you can think about - specifying a runtime library search path ("rpath") in the link-line, - otherwise it will be necessary to update the LD_LIBRARY_PATH environment - variable. - - In any case, auto-detection or manual specification of Blacs/Scalapack, - the configure procedure will check whether Blacs/Scalapack is available - at build-time and try to link with it. - -1.4) Setting optimizations - - - Please set the optimisation that you prefer with the - variable "FCFLAGS", "CFLAGS", and "CXXFLAGS", - please see "./src/elpa2_kernels/README_elpa2_kernels.txt". - - Note that _NO_ compiler optimization flags are set automatically. It - is always adviced to set them by e.g.: - - ./configure CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O2" - - Note that it is mandatory to set optimization flags for C, C++, and Fortran - since ELPA uses source files and compile steps from all these languages. 
- - Also note that building of the SSE and AVX kernels, requires - compilation with the GNU C Compiler (gcc). It is advised to - set also CFLAGS="-march=native" CXXFLAGS="-march=native", - since otherwise the GNU compiler does not support AVX, even - if the hardware does. If you already included "-mAVX" in the - flags, you can skip "-march=native". - - If you want to use the newer AVX2 instructions, assuming they are supported on - your hardware, please set CFLAGS="-march=avx2 -mfma" and CXXFLAGS="-march=avx2 -mfma". - - - Setting the optimization flags for the AVX kernels can be a hassle. If AVX - kernels are build for your system, you can set the configure option - "--with-avx-optimizations=yes". This will automatically set a few compiler - optimization flags which turned out to be beneficial for AVX support. - However, it might be that on your system/compiler version etc. other flags - are the better choice. AND this does _not_ set the above mentioned flags, - which you should still set by hand: - - ./configure CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O2" - - An istallation with AVX2 and best-optimizations could thus look like this: - - ./configure CFLAGS="-O2 -mavx2 -mfma" CXXFLAGS="-O2 -mavx2 -mfma" FCFLAGS="-O2" --with-avx-optimization - -1.5) Installation location - - Set the "--prefix" flag if you wish another installation location than - the default "/usr/local/". - -1.6) Hybrid OpenMP support - - If you want to use the hybrid MPI/OpenMP version of ELPA please specify - "--enable-openmp". Note that the ELPA library will then contain a "_mt" in - it's name to indicate multi threading support. - -1.7) Other - - Note, that at the moment we do not officially support "cross compilation" - although it should work. - -2) run "make" - -3) run "make check" - - A simple test of ELPA is done. At the moment the usage of "mpiexec" - is required. 
If this is not possible at your system, you can run the - binaries - - elpa1_test_real - elpa2_test_real - elpa1_test_complex - elpa2_test_complex - elpa2_test_complex_default_kernel - elpa2_test_complex_choose_kernel_with_api - elpa2_test_real_default_kernel - elpa2_test_real_choose_kernel_with_api - - yourself. At the moment the tests check whether the residual and the - orthogonality of the found eigenvectors are lower than a threshold of - 5e-12. If this test fails, or if you believe the threshold should be - even lower, please talk to us. Furthermore, your run-time choice of - ELPA kernels is tested. This is intended as a help to get used to this - new feature. With the same thought in mind, a binary "elpa2_print_kernels" - is provided, which is rather self-explanatory. - - Also some of the above mentioned tests are provided as C source files. - These should demonstrate, how to call ELPA from a C program (i.e. which headers to include - and call the ELPA functions). They are NOT intended as a copy and paste solution! - - -4) run "make install" - - Note that a pkg-config file for ELPA is produced. You should then be - able to link the ELPA library to your own applications. - - -B) Installing ELPA without the autotools procedure -=================================================== - - We do not support installation without the autotools anymore! - If you think you need this, sorry, but then you are on your own. - - -How to use ELPA -=============== - -Using ELPA should be quite simple. It is similar to ScalaPack but the API -is different. See the examples in the directory "./test". There it is shown how -to evoke ELPA from a Fortran code. - -If you installed ELPA, a pkg-config file is produced which will tell you how to -link your own program with ELPA. 
diff --git a/INSTALL_FROM_GIT_VERSION b/INSTALL_FROM_GIT_VERSION deleted file mode 100644 index d0cec4130..000000000 --- a/INSTALL_FROM_GIT_VERSION +++ /dev/null @@ -1,19 +0,0 @@ -Welcome to the git-based distribution of the ELPA eigensolver library. - -If you are reading this file, you have obtained the ELPA library -through the git repository that hosts the source code and also allows -you to contribute improvements to the project if necessary. - -The git version does not contain the necessary build script: -configure, Makefile ... - -If you use the git version, you are most likely actively developing -features/improvements for ELPA, and a rebuild of the autotools scripts -will be necessary anyway. - -Thus please run "autogen.sh" after your changes, in order to build the -autotoos scripts. Note that autoconf version >= 2.69 is needed for -ELPA. - -After this step, please proceed as written in the "INSTALL" file. - diff --git a/LIBRARY_INTERFACE b/LIBRARY_INTERFACE deleted file mode 100644 index ae4e4fcc7..000000000 --- a/LIBRARY_INTERFACE +++ /dev/null @@ -1,34 +0,0 @@ -Libtool interface number history (the "c" in [c:r:a] of -https://www.gnu.org/software/libtool/manual/html_node/Updating-version-info.html). - -- 0 - Legacy interface number for all releases prior to 2014.06 - -- 1 - Incompatible API change. Most subroutines were converted into functions - returning a success flag. Previously, the library called exit() on error - conditions. - - The state of release 2014.06.001 defines this interface - -- 2 - Incompatible API change. The routine for ELPA 2 real case takes an - optional argument, defining whether QR decomposition is used or not - - The state of release 2015.02.001 defines this interface - -- 3 - Incompatible API change. C Functions are added to provide an C interface. - The subroutine "get_elpa_row_col_comms" has been converted to a function - which returns an error code. 
- - The state of release 2015.02.002 defines this interface - -- 4 - Incompatible API change. The Fortran versions of the library functions return - a logical indicating success or failure. The C versions of the library functions - return an integer {0,1} for success and failure. - Some interface need an extra argument, specifying the number of matrix columns - (see the documentation) - - diff --git a/Makefile.am b/Makefile.am deleted file mode 100644 index 435982eff..000000000 --- a/Makefile.am +++ /dev/null @@ -1,383 +0,0 @@ -## Process this file with automake to produce Makefile.in - -ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4 - -AM_FCFLAGS = $(SCALAPACK_FCFLAGS) @FC_MODINC@modules @FC_MODOUT@modules -AM_LDFLAGS = $(SCALAPACK_LDFLAGS) - -# libelpa -lib_LTLIBRARIES = libelpa@SUFFIX@.la -libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) -lstdc++ - -libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \ - src/elpa_utilities.F90 \ - src/elpa1_compute.F90 \ - src/elpa1.F90 \ - src/elpa2_utilities.F90 \ - src/mod_pack_unpack_real.F90 \ - src/elpa2_kernels/mod_single_hh_trafo_real.F90 \ - src/mod_compute_hh_trafo_real.F90 \ - src/mod_compute_hh_trafo_complex.F90 \ - src/mod_pack_unpack_complex.F90 \ - src/elpa2_compute.F90 \ - src/elpa2.F90 \ - src/elpa_c_interface.F90 \ - src/elpa_qr/qr_utils.f90 \ - src/elpa_qr/elpa_qrkernels.f90 \ - src/elpa_qr/elpa_pdlarfb.f90 \ - src/elpa_qr/elpa_pdgeqrf.F90 -if HAVE_DETAILED_TIMINGS - libelpa@SUFFIX@_la_SOURCES += src/timer.F90 \ - src/ftimings/ftimings.F90 \ - src/ftimings/ftimings_type.F90 \ - src/ftimings/ftimings_value.F90 \ - src/ftimings/highwater_mark.c \ - src/ftimings/resident_set_size.c \ - src/ftimings/time.c \ - src/ftimings/virtual_memory.c \ - src/ftimings/papi.c -endif - -if WITH_REAL_GENERIC_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.F90 -endif - -if WITH_COMPLEX_GENERIC_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.F90 
-endif - -if WITH_REAL_GENERIC_SIMPLE_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_simple.F90 -endif - -if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.F90 -endif - -if WITH_REAL_BGP_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90 -endif - -if WITH_REAL_BGQ_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 -endif - -if WITH_REAL_SSE_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -else -if WITH_COMPLEX_SSE_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -endif -endif - -if WITH_REAL_AVX_BLOCK2_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c -endif - -if WITH_REAL_AVX_BLOCK4_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c -endif - -if WITH_REAL_AVX_BLOCK6_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c -endif - -if WITH_COMPLEX_AVX_BLOCK1_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp -endif - -if WITH_COMPLEX_AVX_BLOCK2_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp -endif - -#if WITH_AVX_SANDYBRIDGE -# libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ -# src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp -#endif - -# install any .mod files in the include/ dir -elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@ -nobase_elpa_include_HEADERS = $(wildcard modules/*) -nobase_elpa_include_HEADERS += elpa/elpa.h elpa/elpa_kernel_constants.h elpa/elpa_generated.h - -man_MANS = man/solve_evp_real.3 \ - man/solve_evp_real_1stage.3 \ - man/solve_evp_complex.3 \ - man/solve_evp_complex_1stage.3 \ - man/solve_evp_real_2stage.3 \ - man/solve_evp_complex_2stage.3 \ - 
man/get_elpa_row_col_comms.3 \ - man/get_elpa_communicators.3 \ - man/print_available_elpa2_kernels.1 - -# other files to distribute -filesdir = $(docdir)/examples -dist_files_DATA = \ - test/fortran_test_programs/read_real.F90 \ - test/fortran_test_programs/test_complex2.F90 \ - test/fortran_test_programs/test_complex2_default_kernel.F90 \ - test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 \ - test/fortran_test_programs/test_complex.F90 \ - test/fortran_test_programs/test_real2.F90 \ - test/fortran_test_programs/test_real2_default_kernel.F90 \ - test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 \ - test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 \ - test/fortran_test_programs/test_real.F90 \ - test/fortran_test_programs/test_real_with_c.F90 \ - src/print_available_elpa2_kernels.F90 - -dist_doc_DATA = README COPYING/COPYING COPYING/gpl.txt COPYING/lgpl.txt - -# pkg-config stuff -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = @PKG_CONFIG_FILE@ - -# programs -bin_PROGRAMS = \ - elpa1_test_real@SUFFIX@ \ - elpa1_test_complex@SUFFIX@ \ - elpa2_test_real@SUFFIX@ \ - elpa2_test_complex@SUFFIX@ \ - elpa2_print_kernels@SUFFIX@ - -noinst_PROGRAMS = \ - elpa2_test_real_default_kernel@SUFFIX@ \ - elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@ \ - elpa2_test_complex_default_kernel@SUFFIX@ \ - elpa2_test_real_choose_kernel_with_api@SUFFIX@ \ - elpa2_test_complex_choose_kernel_with_api@SUFFIX@ \ - elpa1_test_real_c_version@SUFFIX@ \ - elpa1_test_complex_c_version@SUFFIX@ \ - elpa2_test_real_c_version@SUFFIX@ \ - elpa2_test_complex_c_version@SUFFIX@ \ - elpa1_test_real_with_c@SUFFIX@ - - -build_lib = libelpa@SUFFIX@.la - -if HAVE_REDIRECT - redirect_sources = test/shared_sources/redir.c test/shared_sources/redirect.F90 -else - redirect_sources = -endif - -shared_sources = test/shared_sources/util.F90 test/shared_sources/read_input_parameters.F90 test/shared_sources/check_correctnes.F90 
test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 test/shared_sources/prepare_matrix.F90 - -elpa1_test_real_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa1_test_real_c_version.c $(shared_sources) $(redirect_sources) -elpa1_test_real_c_version@SUFFIX@_LDADD = $(build_lib) -elpa1_test_real_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) - -elpa1_test_complex_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa1_test_complex_c_version.c $(shared_sources) $(redirect_sources) -elpa1_test_complex_c_version@SUFFIX@_LDADD = $(build_lib) -elpa1_test_complex_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) - -elpa2_test_real_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa2_test_real_c_version.c $(shared_sources) $(redirect_sources) -elpa2_test_real_c_version@SUFFIX@_LDADD = $(build_lib) -elpa2_test_real_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) - -elpa2_test_complex_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa2_test_complex_c_version.c $(shared_sources) $(redirect_sources) -elpa2_test_complex_c_version@SUFFIX@_LDADD = $(build_lib) -elpa2_test_complex_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) - - -elpa1_test_real@SUFFIX@_SOURCES = test/fortran_test_programs/test_real.F90 $(shared_sources) $(redirect_sources) -elpa1_test_real@SUFFIX@_LDADD = $(build_lib) - -elpa1_test_real_with_c@SUFFIX@_SOURCES = test/fortran_test_programs/test_real_with_c.F90 test/shared_sources/mod_from_c.F90 test/shared_sources/call_elpa1.c $(shared_sources) $(redirect_sources) -elpa1_test_real_with_c@SUFFIX@_LDADD = $(build_lib) - -#elpa1_test_complex_with_c@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex_with_c.F90 test/shared_sources/mod_from_c.F90 test/shared_sources/call_elpa1.c $(shared_sources) $(redirect_sources) -#elpa1_test_complex_with_c@SUFFIX@_LDADD = $(build_lib) - -elpa2_test_real@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2.F90 $(shared_sources) $(redirect_sources) - -elpa2_test_real@SUFFIX@_LDADD = $(build_lib) - 
-elpa2_test_real_default_kernel@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel.F90 $(shared_sources) $(redirect_sources) - -elpa2_test_real_default_kernel@SUFFIX@_LDADD = $(build_lib) - -elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 $(shared_sources) $(redirect_sources) - -elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_LDADD = $(build_lib) - -elpa2_test_real_choose_kernel_with_api@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 $(shared_sources) $(redirect_sources) - -elpa2_test_real_choose_kernel_with_api@SUFFIX@_LDADD = $(build_lib) - -elpa1_test_complex@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex.F90 $(shared_sources) $(redirect_sources) - -elpa1_test_complex@SUFFIX@_LDADD = $(build_lib) - -elpa2_test_complex@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2.F90 $(shared_sources) $(redirect_sources) - -elpa2_test_complex@SUFFIX@_LDADD = $(build_lib) - -elpa2_test_complex_default_kernel@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_default_kernel.F90 $(shared_sources) $(redirect_sources) - -elpa2_test_complex_default_kernel@SUFFIX@_LDADD = $(build_lib) - -elpa2_test_complex_choose_kernel_with_api@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 $(shared_sources) $(redirect_sources) - -elpa2_test_complex_choose_kernel_with_api@SUFFIX@_LDADD = $(build_lib) - -elpa2_print_kernels@SUFFIX@_SOURCES = src/print_available_elpa2_kernels.F90 $(shared_sources) $(redirect_sources) - -elpa2_print_kernels@SUFFIX@_LDADD = $(build_lib) - -check_SCRIPTS = \ - elpa1_test_real.sh \ - elpa1_test_real_with_c.sh \ - elpa1_test_real_c_version.sh \ - elpa1_test_complex_c_version.sh \ - elpa2_test_real_c_version.sh \ - elpa2_test_complex_c_version.sh \ - elpa2_test_real.sh \ - elpa2_test_real_default_kernel.sh \ - elpa1_test_complex.sh \ - 
elpa2_test_complex.sh \ - elpa2_test_complex_default_kernel.sh \ - elpa2_test_real_default_kernel_qr_decomposition.sh \ - elpa2_test_real_choose_kernel_with_api.sh \ - elpa2_test_complex_choose_kernel_with_api.sh \ - elpa2_print_kernels@SUFFIX@ - -TESTS = $(check_SCRIPTS) -elpa1_test_real.sh: - echo 'mpiexec -n 2 ./elpa1_test_real@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real.sh - chmod +x elpa1_test_real.sh - -elpa1_test_real_with_c.sh: - echo 'mpiexec -n 2 ./elpa1_test_real_with_c@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real_with_c.sh - chmod +x elpa1_test_real_with_c.sh - -elpa2_test_real_c_version.sh: - echo 'mpiexec -n 2 ./elpa2_test_real_c_version@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_c_version.sh - chmod +x elpa2_test_real_c_version.sh - -elpa2_test_complex_c_version.sh: - echo 'mpiexec -n 2 ./elpa2_test_complex_c_version@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_c_version.sh - chmod +x elpa2_test_complex_c_version.sh - -elpa1_test_real_c_version.sh: - echo 'mpiexec -n 2 ./elpa1_test_real_c_version@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real_c_version.sh - chmod +x elpa1_test_real_c_version.sh - -elpa1_test_complex_c_version.sh: - echo 'mpiexec -n 2 ./elpa1_test_complex_c_version@SUFFIX@ $$TEST_FLAGS' > elpa1_test_complex_c_version.sh - chmod +x elpa1_test_complex_c_version.sh -elpa2_test_real.sh: - echo 'mpiexec -n 2 ./elpa2_test_real@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real.sh - chmod +x elpa2_test_real.sh - -elpa2_test_real_default_kernel.sh: - echo 'mpiexec -n 2 ./elpa2_test_real_default_kernel@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_default_kernel.sh - chmod +x elpa2_test_real_default_kernel.sh - -elpa2_test_real_default_kernel_qr_decomposition.sh: - echo 'mpiexec -n 2 ./elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@' > elpa2_test_real_default_kernel_qr_decomposition.sh - chmod +x elpa2_test_real_default_kernel_qr_decomposition.sh - -elpa2_test_real_choose_kernel_with_api.sh: - echo 'mpiexec -n 2 ./elpa2_test_real_choose_kernel_with_api@SUFFIX@ 
$$TEST_FLAGS' > elpa2_test_real_choose_kernel_with_api.sh - chmod +x elpa2_test_real_choose_kernel_with_api.sh - -elpa1_test_complex.sh: - echo 'mpiexec -n 2 ./elpa1_test_complex@SUFFIX@ $$TEST_FLAGS' > elpa1_test_complex.sh - chmod +x elpa1_test_complex.sh - -elpa2_test_complex.sh: - echo 'mpiexec -n 2 ./elpa2_test_complex@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex.sh - chmod +x elpa2_test_complex.sh - -elpa2_test_complex_default_kernel.sh: - echo 'mpiexec -n 2 ./elpa2_test_complex_default_kernel@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_default_kernel.sh - chmod +x elpa2_test_complex_default_kernel.sh - -elpa2_test_complex_choose_kernel_with_api.sh: - echo 'mpiexec -n 2 ./elpa2_test_complex_choose_kernel_with_api@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_choose_kernel_with_api.sh - chmod +x elpa2_test_complex_choose_kernel_with_api.sh - -elpa2_utilities.i: $(top_srcdir)/src/elpa2_utilities.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_utilities.F90 -o $@ - -elpa2.i: $(top_srcdir)/src/elpa2.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2.F90 -o $@ - -elpa1.i: $(top_srcdir)/src/elpa1.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@ - -elpa2_kernels_real.i: $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 -o $@ - -mod_compute_hh_trafo_real.i: $(top_srcdir)/src/mod_compute_hh_trafo_real.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_real.F90 -o $@ - -include doxygen.am - -CLEANFILES = \ - elpa-generated.h \ - elpa1_test_real.sh \ - elpa1_test_complex.sh \ - elpa2_test_real.sh \ - elpa2_test_real_default_kernel.sh \ - elpa2_test_real_default_kernel_qr_decomposition.sh \ - elpa2_test_complex.sh \ - elpa2_test_complex_default_kernel.sh \ - elpa2_test_real_choose_kernel_with_api.sh \ - elpa2_test_complex_choose_kernel_with_api.sh \ - 
elpa1_test_real_with_c.sh \ - elpa1_test_real_c_version.sh \ - elpa1_test_complex_c_version.sh \ - elpa2_test_real_c_version.sh \ - elpa2_test_complex_c_version.sh \ - *.i - -clean-local: - -rm -rf modules/* .fortran_dependencies/* - -distclean-local: - -rm config-f90.h - -rm -rf ./src/elpa2_kernels/.deps - -rm -rf ./src/.deps - -rm -rf ./test/.deps - -rm -rf ./elpa/elpa_generated.h - -rmdir ./src/elpa2_kernels/ - -rmdir ./src - -rmdir ./test - -rmdir ./m4 - -rmdir modules/ - -rmdir .fortran_dependencies/ - -EXTRA_DIST = \ - fdep/fortran_dependencies.pl \ - fdep/fortran_dependencies.mk \ - src/elpa_reduce_add_vectors.X90 \ - src/elpa_transpose_vectors.X90 \ - src/redist_band.X90 - -# Rules to re-generated the headers -elpa/elpa_generated.h: $(top_srcdir)/src/elpa_c_interface.F90 - grep -h "^ *!c>" $^ | sed 's/^ *!c>//;' > $@ || { rm $@; exit 1; } - -test/shared_sources/generated.h: $(wildcard $(top_srcdir)/test/shared_sources/*.F90) - grep -h "^ *!c>" $^ | sed 's/^ *!c>//;' > $@ || { rm $@; exit 1; } - -LIBTOOL_DEPS = @LIBTOOL_DEPS@ -libtool: $(LIBTOOL_DEPS) - $(SHELL) ./config.status libtool - - -@FORTRAN_MODULE_DEPS@ - -# Fortran module dependencies only work within each target, -# specify that the test programs need a finished library before -# one can compile them - -# $1 Object name -define require_elpa_lib -$1: libelpa@SUFFIX@.la - -endef -$(foreach p,$(bin_PROGRAMS) $(noinst_PROGRAMS),$(foreach o,$($p_OBJECTS),$(eval $(call require_elpa_lib,$o)))) diff --git a/README b/README deleted file mode 100644 index 26475a0c3..000000000 --- a/README +++ /dev/null @@ -1,89 +0,0 @@ -If you obtained ELPA via the official git repository please have -a look at the "INSTALL_FROM_GIT_VERSION" for specific instructions - -In your use of ELPA, please respect the copyright restrictions -found below and in the "COPYING" directory in this repository. 
In a -nutshell, if you make improvements to ELPA, copyright for such -improvements remains with you, but we request that you relicense any -such improvements under the same exact terms of the (modified) LGPL v3 -that we are using here. Please do not simply absorb ELPA into your own -project and then redistribute binary-only without making your exact -version of the ELPA source code (unmodified or MODIFIED) available as -well. - - -*** Citing: - - A description of some algorithms present in ELPA can be found in: - - T. Auckenthaler, V. Blum, H.-J. Bungartz, T. Huckle, R. Johanni, - L. Kr\"amer, B. Lang, H. Lederer, and P. R. Willems, - "Parallel solution of partial symmetric eigenvalue problems from - electronic structure calculations", - Parallel Computing 37, 783-794 (2011). - doi:10.1016/j.parco.2011.05.002. - - Marek, A.; Blum, V.; Johanni, R.; Havu, V.; Lang, B.; Auckenthaler, - T.; Heinecke, A.; Bungartz, H.-J.; Lederer, H. - "The ELPA library: scalable parallel eigenvalue solutions for electronic - structure theory and computational science", - Journal of Physics Condensed Matter, 26 (2014) - doi:10.1088/0953-8984/26/21/213201 - - Please cite this paper when using ELPA. We also intend to publish an - overview description of the ELPA library as such, and ask you to - make appropriate reference to that as well, once it appears. - - -*** Copyright: - -Copyright of the original code rests with the authors inside the ELPA -consortium. The code is distributed under the terms of the GNU Lesser General -Public License version 3 (LGPL). - -Please also note the express "NO WARRANTY" disclaimers in the GPL. - -Please see the file "COPYING" for details, and the files "gpl.txt" and -"lgpl.txt" for further information. - - -*** Using ELPA: - -ELPA is designed to be compiled (Fortran) on its own, to be later -linked to your own application. 
In order to use ELPA, you must still -have a set of separate libraries that provide - - - Basic Linear Algebra Subroutines (BLAS) - - Lapack routines - - Basic Linear Algebra Communication Subroutines (BLACS) - - Scalapack routines - - a working MPI library - -Appropriate libraries can be obtained and compiled separately on many -architectures as free software. Alternatively, pre-packaged libraries -are usually available from any HPC proprietary compiler vendors. - -For example, Intel's ifort compiler contains the "math kernel library" -(MKL), providing BLAS/Lapack/BLACS/Scalapack functionality. (except on -Mac OS X, where the BLACS and Scalapack part must still be obtained -and compiled separately). - -A very usable general-purpose MPI library is OpenMPI (ELPA was tested -with OpenMPI 1.4.3 for example). Intel MPI seems to be a very well -performing option on Intel platforms. - -Examples of how to use ELPA are included in the accompanying -test_*.f90 subroutines in the "test" directory. An example makefile -"Makefile.example" is also included as a minimal example of how to -build and link ELPA to any other piece of code. In general, however, -we suggest to use the build environment in order to install ELPA -as library to your system. - - -*** Structure of this repository: - -As in most git repositories, also this repository contains different branches. -The branch "master" is always identical to the one representing the latest release -of ELPA. All other branches, either represent development work, or previous releases of -ELPA. -. diff --git a/RELEASE_NOTES b/RELEASE_NOTES deleted file mode 100644 index a3da56730..000000000 --- a/RELEASE_NOTES +++ /dev/null @@ -1,32 +0,0 @@ -This file contains the release notes for the ELPA 2015.11.001 version - - - -What is new? -------------- - -a) ABI change ---------------------- - -Most importantly, the ABI of the ELPA libray changed! -A rebuild/relink of the user code using the ELPA library is mandatory! 
-Hopefully, this will be the last ABI change for some time. - -b) C interface ----------------------- - -ELPA now is shipped with a C interface to directly call the ELPA library (written in Fortran) -from C code. Header files are provided to declare the c functions. -Since ELPA is still a Fortran library it might be necessary to link it together with the needed -Fortran runtime libraries in your C code. - - - -Any incompatibles to previous version? ---------------------------------------- - -As mentioned before, the ABI of ELPA has changed! It will be necessary -to rebuild the programs using ELPA, if this new version should be used. -Beware, using the new library with code which was build with an older verion -should not even run. If it does, the results will be wrong ! - diff --git a/THIS_REPO_HAS_MOVED b/THIS_REPO_HAS_MOVED new file mode 100644 index 000000000..0773211cb --- /dev/null +++ b/THIS_REPO_HAS_MOVED @@ -0,0 +1,14 @@ +Dear user, + +this git repo has been moved to + + https://gitlab.mpcdf.mpg.de/elpa/elpa.git + +To update your working copies to use the new location, do + + git remote set-url origin https://gitlab.mpcdf.mpg.de/elpa/elpa.git + git pull + +Please note that this commit is only meant to notify you, the previous history +of the project is still contained in this repository up to the point of the +migration. diff --git a/autogen.sh b/autogen.sh deleted file mode 100755 index c62236285..000000000 --- a/autogen.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh -e - -mkdir -p m4/ - -test -n "$srcdir" || srcdir=`dirname "$0"` -test -n "$srcdir" || srcdir=. 
- -autoreconf --force --install --verbose "$srcdir" diff --git a/configure.ac b/configure.ac deleted file mode 100644 index 847865ccc..000000000 --- a/configure.ac +++ /dev/null @@ -1,749 +0,0 @@ -AC_PREREQ([2.69]) - -AC_INIT([elpa],[2015.11.001], [elpa-library@mpcdf.mpg.de]) -AC_SUBST([PACKAGE_VERSION]) - -AC_CONFIG_SRCDIR([src/elpa1.F90]) - -AM_INIT_AUTOMAKE([foreign -Wall subdir-objects]) - -# Without this, automake tries to be smart and rebuilt -# the autoconf generated files such as configure, aclocal.m4, etc., -# in case the timestamps of files such as configure.ac are newer -# -# This only makes trouble for end users with out-of-date autoconf versions -# that cannot produce these files -AM_MAINTAINER_MODE([disable]) - -AC_CONFIG_MACRO_DIR([m4]) -AC_CONFIG_HEADERS([config.h]) -AM_SILENT_RULES([yes]) - -rm -rf config.h config-f90.h - -# Set the libtool library version, see LIBRARY_INTERFACE -# -# See http://www.gnu.org/software/libtool/manual/html_node/Updating-version-info.html -# -# [c:r:a] -# -# c: The currently implement interface -# r: The revision number of the current interface, that is the number -# of released source-code changes for the current interface -# a: The "age" is the number of interfaces prior to c that are also supported -# by the current interface, as they are ABI compatible (e.g. 
only new symbols -# were added by the new interface) -# -AC_SUBST([ELPA_SO_VERSION], [5:0:1]) -# - - -AX_CHECK_GNU_MAKE() -if test x$_cv_gnu_make_command = x ; then - AC_MSG_ERROR([Need GNU Make]) -fi - -AC_CHECK_PROG(CPP_FOUND,cpp,yes,no) -if test x"${CPP_FOUND}" = xno; then - AC_MSG_ERROR([no cpp found]) -fi - -# gnu-make fortran module dependencies -m4_include([fdep/fortran_dependencies.m4]) -FDEP_F90_GNU_MAKE_DEPS - -### - -m4_include([m4/ax_elpa_openmp.m4]) - -AC_MSG_CHECKING(whether --enable-openmp is specified) -AC_ARG_ENABLE([openmp], - AS_HELP_STRING([--enable-openmp], - [use OpenMP threading, default no.]), - [], - [enable_openmp=no]) -AC_MSG_RESULT([${enable_openmp}]) -AM_CONDITIONAL([WITH_OPENMP],[test x"$enable_openmp" = x"yes"]) -if test x"${enable_openmp}" = x"yes"; then - AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading]) -fi - -dnl check whether mpi compilers are available; -dnl if not abort since it is mandatory - -# C -AC_LANG([C]) -m4_include([m4/ax_prog_cc_mpi.m4]) -AX_PROG_CC_MPI([true],[],[AC_MSG_ERROR([no MPI C wrapper found])]) -if test x"${enable_openmp}" = x"yes"; then - AX_ELPA_OPENMP - if test "$ac_cv_prog_cc_openmp" = unsupported; then - AC_MSG_ERROR([Could not compile a C program with OpenMP, adjust CFLAGS]) - fi - CFLAGS="$OPENMP_CFLAGS $CFLAGS" -fi - -AC_PROG_INSTALL -AM_PROG_AR -AM_PROG_AS - - -# Fortran -AC_LANG([Fortran]) -m4_include([m4/ax_prog_fc_mpi.m4]) -AX_PROG_FC_MPI([],[],[AC_MSG_ERROR([no MPI Fortran wrapper found])]) - -if test x"${enable_openmp}" = x"yes"; then - AX_ELPA_OPENMP - if test "$ac_cv_prog_fc_openmp" = unsupported; then - AC_MSG_ERROR([Could not compile a Fortran program with OpenMP, adjust FCFLAGS]) - fi - FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS" -fi - -# C++ -AC_LANG([C++]) -AC_PROG_CXX - -if test x"${enable_openmp}" = x"yes"; then - AX_ELPA_OPENMP - if test "$ac_cv_prog_cxx_openmp" = unsupported; then - AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS]) - fi - 
CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS" -fi - - - -dnl variables needed for the tests - -dnl do NOT remove any variables here, until -dnl 1. you know 100% what you are doing -dnl 2. you tested ALL configure functionality afterwards -dnl Otherwise, you most likely break some functionality - -dnl as default always define the generic kernels to be build -dnl this is only unset if gpu_support_only is defined, or -dnl other specific real/complex kernels are wanted - -install_real_generic=yes -install_real_generic_simple=yes - -install_complex_generic=yes -install_complex_generic_simple=yes - -AC_LANG([C]) - -dnl build with ftimings support -AC_MSG_CHECKING(whether ELPA should be build with ftimings support) -AC_ARG_WITH([ftimings], - AS_HELP_STRING([--with-ftimings], - [detailed timings, default no.]), - [with_ftimings=yes], - [with_ftimings=no]) -AC_MSG_RESULT([${with_ftimings}]) - -dnl build with the possibilty to redirect stdout and stderr -dnl per MPI task in a file -AC_MSG_CHECKING(whether stdout/stderr file redirect should be enabled) -AC_ARG_WITH([redirect], - AS_HELP_STRING([--with-redirect], - [for test programs, allow redirection of stdout/stderr per MPI taks in a file (useful for ftimings), default no.]), - [with_redirect=yes], - [with_redirect=no]) -AC_MSG_RESULT([${with_redirect}]) - -if test x"${with_redirect}" = x"yes"; then - AC_DEFINE([HAVE_REDIRECT], [1], [Redirect stdout and stderr of test programs per MPI tasks to a file]) -fi -AM_CONDITIONAL([HAVE_REDIRECT],[test x"$with_redirect" = x"yes"]) - -if test x"${with_ftimings}" = x"yes"; then - AC_DEFINE([HAVE_DETAILED_TIMINGS], [1], [Enable more timings]) - AC_ARG_ENABLE([papi], - [AS_HELP_STRING([--disable-papi],[Do not use PAPI to also measure flop count, autodetected by default])], - [want_papi=$enableval],[want_papi="auto"]) - papi_found=unknown - if test x"$want_papi" != x"no" ; then - AC_CHECK_LIB([papi],[PAPI_library_init],[papi_found="yes"],[papi_found="no"]) - if test x"$want_papi" = x"yes" ; then 
- if test x"$papi_found" = x"no" ; then - AC_MSG_ERROR(["Could not find usable PAPI installation, please adjust CFLAGS, LDFLAGS"]) - fi - fi - fi - if test x"$papi_found" = x"yes"; then - AC_DEFINE([HAVE_LIBPAPI], [1], [Use the PAPI library]) - LIBS="-lpapi $LIBS" - fi -fi -AM_CONDITIONAL([HAVE_DETAILED_TIMINGS],[test x"$with_ftimings" = x"yes"]) - -AC_MSG_CHECKING(whether SSE assembler kernel can be compiled) - -$CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -o test.o 2>/dev/null -if test "$?" == 0; then - can_compile_sse=yes - install_real_sse=yes - install_complex_sse=yes -else - can_compile_sse=no - install_real_sse=no - install_complex_sse=no -fi -rm -f ./test.o -AC_MSG_RESULT([${can_compile_sse}]) - -dnl check whether one can compile with avx - gcc intrinsics - -dnl first pass: try with specified CFLAGS and CXXFLAGS -AC_MSG_CHECKING([whether we can compile AVX intrinsics in C]) -AC_COMPILE_IFELSE([AC_LANG_SOURCE([ - #include <x86intrin.h> - int main(int argc, char **argv){ - double* q; - __m256d a1_1 = _mm256_load_pd(q); - return 0; - } - ])], - [can_compile_avx=yes], - [can_compile_avx=no] -) -AC_MSG_RESULT([${can_compile_avx}]) -if test "${can_compile_avx}" = "yes" ; then - AC_MSG_CHECKING([whether we can compile AVX intrinsics in C++]) - AC_LANG_PUSH([C++]) - AC_COMPILE_IFELSE([AC_LANG_SOURCE([ - #include <x86intrin.h> - int main(int argc, char **argv){ - double* q; - __m256d a1_1 = _mm256_load_pd(q); - return 0; - } - ])], - [can_compile_avx=yes], - [can_compile_avx=no] - ) - AC_LANG_POP([C++]) - AC_MSG_RESULT([${can_compile_avx}]) - if test "${can_compile_avx}" = "no" ; then - AC_MSG_WARN([Cannot compile C++ with AVX: disabling AVX alltogether]) - fi -fi - -AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C]) -AC_COMPILE_IFELSE([AC_LANG_SOURCE([ - #include <x86intrin.h> - int main(int argc, char **argv){ - double* q; - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_fmadd_pd(q1, q1, q1); - return 0; - } - ])], - 
[can_compile_avx2=yes], - [can_compile_avx2=no] -) -AC_MSG_RESULT([${can_compile_avx2}]) -if test "${can_compile_avx2}" = "yes" ; then - AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C++]) - AC_LANG_PUSH([C++]) - AC_COMPILE_IFELSE([AC_LANG_SOURCE([ - #include <x86intrin.h> - int main(int argc, char **argv){ - double* q; - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_fmadd_pd(q1, q1, q1); - return 0; - } - ])], - [can_compile_avx2=yes], - [can_compile_avx2=no] - ) - AC_LANG_POP([C++]) - AC_MSG_RESULT([${can_compile_avx2}]) - if test "${can_compile_avx2}" = "no" ; then - AC_MSG_WARN([Cannot compile C++ with AVX2!]) - fi -fi - - -if test "${can_compile_avx}" = "yes" ; then - install_real_avx_block2=yes - install_real_avx_block4=yes - install_real_avx_block6=yes - - install_complex_avx_block1=yes - install_complex_avx_block2=yes - - want_avx=yes -else - install_real_avx_block2=no - install_real_avx_block4=no - install_real_avx_block6=no - - install_complex_avx_block1=no - install_complex_avx_block2=no - - want_avx=yes -fi - -dnl set the AVX optimization flags if this option is specified -AC_MSG_CHECKING(whether AVX optimization flags should be set automatically) -AC_ARG_WITH([avx-optimization], - AS_HELP_STRING([--with-avx-optimization], - [use AVX optimization, default no.]), - [with_avx_optimization=yes], - [with_avx_optimization=no]) -AC_MSG_RESULT([${with_avx_optimization}]) -if test x"${with_avx_optimization}" = x"yes"; then - CFLAGS="$CFLAGS -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize" - CXXFLAGS="$CXXFLAGS -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize" -fi - -AC_LANG([Fortran]) -AC_FC_FREEFORM -AC_FC_MODULE_FLAG -AC_FC_MODULE_OUTPUT_FLAG -AC_FC_LIBRARY_LDFLAGS - -save_FCFLAGS=$FCFLAGS -save_LDFLAGS=$LDFLAGS - -AC_ARG_VAR([SCALAPACK_LDFLAGS],[Extra LDFLAGS necessary to link a program with Scalapack]) 
-AC_ARG_VAR([SCALAPACK_FCFLAGS],[Extra FCFLAGS necessary to compile a Fortran program with Scalapack]) - -FCFLAGS="$FCFLAGS $SCALAPACK_FCFLAGS" -LDFLAGS="$LDFLAGS $SCALAPACK_LDFLAGS" - -dnl check whether fortran error_unit is defined -AC_MSG_CHECKING([whether Fortran module iso_fortran_env is available]) -AC_COMPILE_IFELSE([AC_LANG_SOURCE([ - program test_error_unit - use ISO_FORTRAN_ENV, only : error_unit - implicit none - - write(error_unit,*) "error_unit is defined" - end program -])], - [can_use_iso_fortran_env=yes], - [can_use_iso_fortran_env=no] -) -AC_MSG_RESULT([${can_use_iso_fortran_env}]) - -dnl check whether one can link with specified MKL (desired method) -AC_MSG_CHECKING([whether we can compile a Fortran program using MKL]) - - -AC_COMPILE_IFELSE([AC_LANG_SOURCE([ - program test_mkl - use mkl_service - character*198 :: string - call mkl_get_version_string(string) - write(*,'(a)') string - end program -])], - [can_compile_with_mkl=yes], - [can_compile_with_mkl=no] -) -AC_MSG_RESULT([${can_compile_with_mkl}]) - -if test x"$can_compile_with_mkl" = x"yes" ; then - AC_MSG_CHECKING([whether we can link a Fortran program with MKL]) - AC_LINK_IFELSE([AC_LANG_SOURCE([ - program test_mkl - use mkl_service - character*198 :: string - call mkl_get_version_string(string) - write(*,'(a)') string - end program - ])], - [have_mkl=yes], - [have_mkl=no] - ) - AC_MSG_RESULT([${have_mkl}]) -fi - -dnl if not mkl, check all the necessary individually -if test x"${have_mkl}" = x"yes" ; then - WITH_MKL=1 -else - - dnl first check blas - AC_SEARCH_LIBS([dgemm],[blas],[have_blas=yes],[have_blas=no]) - AC_MSG_CHECKING([whether we can link a program with a blas lib]) - AC_MSG_RESULT([${have_blas}]) - - if test x"${have_blas}" = x"no" ; then - AC_MSG_ERROR([could not link with blas: specify path]) - fi - dnl now lapack - AC_SEARCH_LIBS([dlarrv],[lapack],[have_lapack=yes],[have_lapack=no]) - AC_MSG_CHECKING([whether we can link a program with a lapack lib]) - 
AC_MSG_RESULT([${have_lapack}]) - - if test x"${have_lapack}" = x"no" ; then - AC_MSG_ERROR([could not link with lapack: specify path]) - fi - - dnl test whether scalapack already contains blacs - scalapack_libs="mpiscalapack scalapack" - old_LIBS="$LIBS" - for lib in ${scalapack_libs}; do - LIBS="-l${lib} ${old_LIBS}" - AC_MSG_CHECKING([whether -l${lib} already contains a BLACS implementation]) - AC_LINK_IFELSE([AC_LANG_FUNC_LINK_TRY([blacs_gridinit])],[blacs_in_scalapack=yes],[blacs_in_scalapack=no]) - AC_MSG_RESULT([${blacs_in_scalapack}]) - if test x"${blacs_in_scalapack}" = x"yes"; then - break - fi - done - - if test x"${blacs_in_scalapack}" = x"no"; then - LIBS="${old_LIBS}" - - dnl Test for stand-alone blacs - AC_SEARCH_LIBS([bi_f77_init],[mpiblacsF77init],[],[],[-lmpiblacs]) - AC_SEARCH_LIBS([blacs_gridinit],[mpiblacs blacs],[have_blacs=yes],[have_blacs=no]) - - if test x"${have_blacs}" = x"no"; then - AC_MSG_ERROR([No usable BLACS found. If installed in a non-standard place, please specify suitable LDFLAGS and FCFLAGS as arguments to configure]) - fi - fi - - AC_SEARCH_LIBS([pdtran],[$scalapack_libs],[have_scalapack=yes],[have_scalapack=no]) - - if test x"${have_scalapack}" = x"no" ; then - AC_MSG_ERROR([could not link with scalapack: specify path]) - fi - - dnl check whether we can link alltogehter - AC_MSG_CHECKING([whether we can link a Fortran program with all blacs/scalapack]) - AC_LINK_IFELSE([AC_LANG_SOURCE([ - program dgemm_test - - integer , parameter:: M = 4, N = 3, K = 2 - real :: A(M,K), B(K,N), C(M,N) - - call dgemm('N','N',M,N,K,1.0,A,M,B,K,0.0,C,M) - - end program dgemm_test - ])], - [can_link_with_blacs_scalapack=yes], - [can_link_with_blacs_scalapack=no] - ) - AC_MSG_RESULT([${can_link_with_blacs_scalapack}]) - - if test x"${can_link_with_blacs_scalapack}" = x"yes" ; then - WITH_BLACS=1 - else - AC_MSG_ERROR([We can neither link with MKL or another Scalpack. 
Please specify SCALAPACK_LDFLAGS and SCALAPACK_FCFLAGS!]) - fi -fi - -dnl important: reset them again! -FCFLAGS=$save_FCFLAGS -LDFLAGS=$save_LDFLAGS - -dnl check for intrinsic fortran function of 2003 standard -AC_MSG_CHECKING([whether we can use the intrinsic Fortran function "get_environment_variable"]) - - -AC_COMPILE_IFELSE([AC_LANG_SOURCE([ - program test_get_environment - - character(len=256) :: homedir - call get_environment_variable("HOME",homedir) - end program -])], - [fortran_can_check_environment=yes], - [fortran_can_check_environment=no] -) -AC_MSG_RESULT([${fortran_can_check_environment}]) - -dnl now check which kernels can be compiled - -dnl the checks for SSE were already done before -dnl the checks for AVX were already done before - -dnl check BGP kernel -AC_MSG_CHECKING([whether we can compile with BGP intrinsics]) - - -AC_LINK_IFELSE([AC_LANG_SOURCE([ - program test_bgp - complex*16 :: y3,q3,h2 - y3 = fxcpmadd(y3,q3,h2) - - end program -])], - [can_compile_bgp=yes], - [can_compile_bgp=no] -) -AC_MSG_RESULT([${can_compile_bgp}]) - -if test x"${can_compile_bgp}" = x"yes" ; then - install_real_bgp=yes - install_complex_bgp=yes -else - install_real_bgp=no - install_complex_bgp=no -fi - -dnl check BGQ kernel -AC_MSG_CHECKING([whether we can compile with BGQ intrinsics]) - -AC_LINK_IFELSE([AC_LANG_SOURCE([ - program test_bgq - VECTOR(REAL(8))::QPX_h2 - real*8 :: hh(10,2) - QPX_h2 = VEC_SPLATS(hh(2,2)) - - end program -])], - [can_compile_bgq=yes], - [can_compile_bgq=no] -) -AC_MSG_RESULT([${can_compile_bgq}]) - -if test x"${can_compile_bgq}" = x"yes" ; then - install_real_bgq=yes - install_complex_bgq=yes -else - install_real_bgq=no - install_complex_bgq=no -fi - - -dnl environment variable setting of kernel -if test x"${fortran_can_check_environment}" = x"yes" ; then - AC_DEFINE([HAVE_ENVIRONMENT_CHECKING],[1],[Fortran can querry environment variables]) -fi - -dnl last check whether user wants to compile only a specific kernel -dnl - 
-m4_include([m4/ax_elpa_specific_kernels.m4]) - -dnl real kernels - dnl do not remove this variable it is needed in the macros - use_specific_real_kernel=no - - dnl generic kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-kernel-only],[generic-kernel],[install_real_generic]) - - dnl generic-simple kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-simple-kernel-only],[generic-simple-kernel],[install_real_generic_simple]) - - dnl sse kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-kernel-only],[sse-kernel],[install_real_sse]) - - dnl bgp kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgp-kernel-only],[bgp-kernel],[install_real_bgp]) - - dnl bgq kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq]) - - dnl real-avx-block2 kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2]) - - dnl real-avx-block4 kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block4-kernel]-only,[real-avx-block4-kernel],[install_real_avx_block4]) - - dnl real-avx-block6 kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block6-kernel-only],[real-avx-block6-kernel],[install_real_avx_block6]) - - -dnl complex kernels - - dnl do not remove this variable it is needed in the macros - use_specific_complex_kernel=no - - dnl generic kernel - DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-kernel-only],[generic-kernel],[install_complex_generic]) - - dnl generic-simple kernel - DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-simple-kernel-only],[generic-simple-kernel],[install_complex_generic_simple]) - - dnl sse kernel - DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-kernel-only],[sse-kernel],[install_complex_sse]) - - dnl complex-bqp kernel - DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgp-kernel-only],[bgp-kernel],[install_complex_bgp]) - - dnl complex-bqq kernel - 
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq]) - - dnl complex-avx-block1 kernel - DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1]) - - dnl complex-avx-block2 kernel - DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block2-kernel-only],[complex-avx-block2-kernel],[install_complex_avx_block2]) - -dnl set the conditionals according to the previous tests -if test x"${can_use_iso_fortran_env}" = x"yes" ; then - AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env]) -fi - -AM_CONDITIONAL([WITH_REAL_GENERIC_KERNEL],[test x"$install_real_generic" = x"yes"]) -if test x"${install_real_generic}" = x"yes" ; then - AC_DEFINE([WITH_REAL_GENERIC_KERNEL],[1],[can use real generic kernel]) -fi - -AM_CONDITIONAL([WITH_COMPLEX_GENERIC_KERNEL],[test x"$install_complex_generic" = x"yes"]) -if test x"${install_complex_generic}" = x"yes" ; then - AC_DEFINE([WITH_COMPLEX_GENERIC_KERNEL],[1],[can use complex generic kernel]) -fi - -AM_CONDITIONAL([WITH_REAL_GENERIC_SIMPLE_KERNEL],[test x"$install_real_generic_simple" = x"yes"]) -if test x"${install_real_generic_simple}" = x"yes" ; then - AC_DEFINE([WITH_REAL_GENERIC_SIMPLE_KERNEL],[1],[can use real generic-simple kernel]) -fi - -AM_CONDITIONAL([WITH_COMPLEX_GENERIC_SIMPLE_KERNEL],[test x"$install_complex_generic_simple" = x"yes"]) -if test x"${install_complex_generic_simple}" = x"yes" ; then - AC_DEFINE([WITH_COMPLEX_GENERIC_SIMPLE_KERNEL],[1],[can use complex generic-simple kernel]) -fi - -AM_CONDITIONAL([WITH_REAL_SSE_KERNEL],[test x"$install_real_sse" = x"yes"]) -if test x"${install_real_sse}" = x"yes" ; then - AC_DEFINE([WITH_REAL_SSE_KERNEL],[1],[can use real SSE kernel]) -fi - -AM_CONDITIONAL([WITH_COMPLEX_SSE_KERNEL],[test x"$install_complex_sse" = x"yes"]) -if test x"${install_complex_sse}" = x"yes" ; then - AC_DEFINE([WITH_COMPLEX_SSE_KERNEL],[1],[can use complex SSE kernel]) -fi 
- -AM_CONDITIONAL([WITH_REAL_AVX_BLOCK2_KERNEL],[test x"$install_real_avx_block2" = x"yes"]) -if test x"${install_real_avx_block2}" = x"yes" ; then - AC_DEFINE([WITH_REAL_AVX_BLOCK2_KERNEL],[1],[can use real_avx_block2 kernel]) -fi - -AM_CONDITIONAL([WITH_REAL_AVX_BLOCK4_KERNEL],[test x"$install_real_avx_block4" = x"yes"]) -if test x"${install_real_avx_block4}" = x"yes" ; then - AC_DEFINE([WITH_REAL_AVX_BLOCK4_KERNEL],[1],[can use real_avx_block4 kernel]) -fi - -AM_CONDITIONAL([WITH_REAL_AVX_BLOCK6_KERNEL],[test x"$install_real_avx_block6" = x"yes"]) -if test x"${install_real_avx_block6}" = x"yes" ; then - AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 kernel]) -fi - -AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[test x"$install_complex_avx_block1" = x"yes"]) -if test x"${install_complex_avx_block1}" = x"yes" ; then - AC_DEFINE([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[1],[can use complex_avx_block1 kernel]) -fi - -AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[test x"$install_complex_avx_block2" = x"yes"]) -if test x"${install_complex_avx_block2}" = x"yes" ; then - AC_DEFINE([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[1],[can use complex_avx_block2 kernel]) -fi - -AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"]) -if test x"${install_real_bgp}" = x"yes" ; then - AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel]) -fi - -AM_CONDITIONAL([WITH_REAL_BGQ_KERNEL],[test x"$install_real_bgq" = x"yes"]) -if test x"${install_real_bgq}" = x"yes" ; then - AC_DEFINE([WITH_REAL_BGQ_KERNEL],[1],[can use real BGQ kernel]) -fi - -if test x"${use_specific_complex_kernel}" = x"no" ; then - AC_DEFINE([WITH_NO_SPECIFIC_COMPLEX_KERNEL],[1],[do not use only one specific complex kernel (set at compile time)]) -fi - -if test x"${use_specific_real_kernel}" = x"no" ; then - AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)]) -fi - -LT_INIT - -DX_PDF_FEATURE(OFF) -DX_PS_FEATURE(OFF) 
-DX_MAN_FEATURE(ON) -DX_HTML_FEATURE(ON) -DX_INIT_DOXYGEN([ELPA], [Doxyfile], [docs]) - -DESPERATELY_WANT_ASSUMED_SIZE=0 -if text x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then - AC_DEFINE([DESPERATELY_WANT_ASSUMED_SIZE],[1],[use assumed size arrays, even if not debuggable]) -fi - -AC_SUBST([WITH_MKL]) -AC_SUBST([WITH_BLACS]) -AC_SUBST([with_amd_bulldozer_kernel]) -AC_SUBST([FC_MODINC]) -AC_SUBST([FC_MODOUT]) -AC_SUBST([OPENMP_CFLAGS]) -AC_SUBST([OPENMP_FCFLAGS]) -AC_SUBST([OPENMP_LDFLAGS]) -#AC_SUBST(OPT_FCFLAGS) -AC_SUBST([DOXYGEN_OUTPUT_DIR], [docs]) - -rm -rf modules/ .fortran_dependencies/ -mkdir modules - -#gl_VISIBILITY -#AH_BOTTOM([#if HAVE_VISIBILITY -#define EXPORTED __attribute__((__visibility__("default"))) -#define HIDDEN __attribute__((__visibility__("hidden"))) -#else -#define EXPORTED -#define HIDDEN -#endif]) - - -# Some part of libtool is too smart and tries to parse the output of -# gfortran -v -# and catches anything that resembles a -l library link specification. -# Unfortunately, recent versions of gfortran emit -# -l gfortran -# with a space between -l and gfortran. The empty -l is then included -# into "postdeps_FC" and causes linking errors later on. -postdeps_FC=$(echo $postdeps_FC | sed 's/-l //g') - -if test x"${enable_openmp}" = x"yes"; then - SUFFIX="_openmp" -else - SUFFIX="" -fi -AC_SUBST([SUFFIX]) -AC_SUBST([PKG_CONFIG_FILE],[elpa${SUFFIX}-${PACKAGE_VERSION}.pc]) - -AC_CONFIG_FILES([ - Makefile - Doxyfile - ${PKG_CONFIG_FILE}:elpa.pc.in -]) - -AC_OUTPUT - - -if test -e config.h ; then - grep "^#define" config.h > config-f90.h -else - echo "Warning! No config.h was generated, check for errors and re-run configure!" - exit 1 -fi - -echo "Generating elpa/elpa_generated.h..." -mkdir -p elpa -grep -h "^ *!c>" $srcdir/src/elpa_c_interface.F90 | sed 's/^ *!c>//;' > elpa/elpa_generated.h || exit 1 - -echo "Generating test/shared_sources/generated.h..." 
-mkdir -p test/shared_sources -grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1 - -if test "${can_compile_avx}" = "no" ; then - if test x"${want_avx}" = x"yes" ; then - AC_MSG_WARN([Could not compile AVX instructions]) - fi -fi -if test "${can_compile_avx2}" = "no" ; then - if test x"${want_avx}" = x"yes" ; then - AC_MSG_WARN([Could not compile AVX2 instructions]) - fi -fi - diff --git a/doxygen.am b/doxygen.am deleted file mode 100644 index f869a591a..000000000 --- a/doxygen.am +++ /dev/null @@ -1,156 +0,0 @@ -## --------------------------------- ## -## Format-independent Doxygen rules. ## -## --------------------------------- ## - -if DX_COND_doc - -## ------------------------------- ## -## Rules specific for HTML output. ## -## ------------------------------- ## - -if DX_COND_html - -DX_CLEAN_HTML = @DX_DOCDIR@/html - -endif DX_COND_html - -## ------------------------------ ## -## Rules specific for CHM output. ## -## ------------------------------ ## - -if DX_COND_chm - -DX_CLEAN_CHM = @DX_DOCDIR@/chm - -if DX_COND_chi - -DX_CLEAN_CHI = @DX_DOCDIR@/@PACKAGE@.chi - -endif DX_COND_chi - -endif DX_COND_chm - -## ------------------------------ ## -## Rules specific for MAN output. ## -## ------------------------------ ## - -if DX_COND_man - -DX_CLEAN_MAN = @DX_DOCDIR@/man - -endif DX_COND_man - -## ------------------------------ ## -## Rules specific for RTF output. ## -## ------------------------------ ## - -if DX_COND_rtf - -DX_CLEAN_RTF = @DX_DOCDIR@/rtf - -endif DX_COND_rtf - -## ------------------------------ ## -## Rules specific for XML output. ## -## ------------------------------ ## - -if DX_COND_xml - -DX_CLEAN_XML = @DX_DOCDIR@/xml - -endif DX_COND_xml - -## ----------------------------- ## -## Rules specific for PS output. 
## -## ----------------------------- ## - -if DX_COND_ps - -DX_CLEAN_PS = @DX_DOCDIR@/@PACKAGE@.ps - -DX_PS_GOAL = doxygen-ps - -doxygen-ps: @DX_DOCDIR@/@PACKAGE@.ps - -@DX_DOCDIR@/@PACKAGE@.ps: @DX_DOCDIR@/@PACKAGE@.tag - cd @DX_DOCDIR@/latex; \ - rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \ - $(DX_LATEX) refman.tex; \ - $(MAKEINDEX_PATH) refman.idx; \ - $(DX_LATEX) refman.tex; \ - countdown=5; \ - while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \ - refman.log > /dev/null 2>&1 \ - && test $$countdown -gt 0; do \ - $(DX_LATEX) refman.tex; \ - countdown=`expr $$countdown - 1`; \ - done; \ - $(DX_DVIPS) -o ../@PACKAGE@.ps refman.dvi - -endif DX_COND_ps - -## ------------------------------ ## -## Rules specific for PDF output. ## -## ------------------------------ ## - -if DX_COND_pdf - -DX_CLEAN_PDF = @DX_DOCDIR@/@PACKAGE@.pdf - -DX_PDF_GOAL = doxygen-pdf - -doxygen-pdf: @DX_DOCDIR@/@PACKAGE@.pdf - -@DX_DOCDIR@/@PACKAGE@.pdf: @DX_DOCDIR@/@PACKAGE@.tag - cd @DX_DOCDIR@/latex; \ - rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \ - $(DX_PDFLATEX) refman.tex; \ - $(DX_MAKEINDEX) refman.idx; \ - $(DX_PDFLATEX) refman.tex; \ - countdown=5; \ - while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \ - refman.log > /dev/null 2>&1 \ - && test $$countdown -gt 0; do \ - $(DX_PDFLATEX) refman.tex; \ - countdown=`expr $$countdown - 1`; \ - done; \ - mv refman.pdf ../@PACKAGE@.pdf - -endif DX_COND_pdf - -## ------------------------------------------------- ## -## Rules specific for LaTeX (shared for PS and PDF). 
## -## ------------------------------------------------- ## - -if DX_COND_latex - -DX_CLEAN_LATEX = @DX_DOCDIR@/latex - -endif DX_COND_latex - -.PHONY: doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL) - -.INTERMEDIATE: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL) - -doxygen-run: @DX_DOCDIR@/@PACKAGE@.tag - -doxygen-doc: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL) - -@DX_DOCDIR@/@PACKAGE@.tag: $(DX_CONFIG) $(pkginclude_HEADERS) - rm -rf @DX_DOCDIR@ - $(DX_ENV) $(DX_DOXYGEN) $(DX_CONFIG) - -DX_CLEANFILES = \ - @DX_DOCDIR@/@PACKAGE@.tag \ - -r \ - $(DX_CLEAN_HTML) \ - $(DX_CLEAN_CHM) \ - $(DX_CLEAN_CHI) \ - $(DX_CLEAN_MAN) \ - $(DX_CLEAN_RTF) \ - $(DX_CLEAN_XML) \ - $(DX_CLEAN_PS) \ - $(DX_CLEAN_PDF) \ - $(DX_CLEAN_LATEX) - -endif DX_COND_doc diff --git a/elpa.pc.in b/elpa.pc.in deleted file mode 100644 index cac98a0e8..000000000 --- a/elpa.pc.in +++ /dev/null @@ -1,12 +0,0 @@ -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -Name: @PACKAGE_NAME@@SUFFIX@ -Description: ELPA is a Fortran-based high-performance computational library for the (massively) parallel solution of symmetric or Hermitian, standard or generalized eigenvalue problems. 
-Version: @PACKAGE_VERSION@ -URL: @PACKAGE_URL@ -Libs: -L${libdir} -lelpa@SUFFIX@ @LIBS@ @OPENMP_FCFLAGS@ -Cflags: -I${includedir}/elpa@SUFFIX@-@PACKAGE_VERSION@ @OPENMP_CFLAGS@ -fcflags= -I${includedir}/elpa@SUFFIX@-@PACKAGE_VERSION@/modules @OPENMP_FCFLAGS@ diff --git a/elpa/elpa.h b/elpa/elpa.h deleted file mode 100644 index 54f7114f4..000000000 --- a/elpa/elpa.h +++ /dev/null @@ -1,2 +0,0 @@ -#include <elpa/elpa_kernel_constants.h> -#include <elpa/elpa_generated.h> diff --git a/elpa/elpa_kernel_constants.h b/elpa/elpa_kernel_constants.h deleted file mode 100644 index 45ae16406..000000000 --- a/elpa/elpa_kernel_constants.h +++ /dev/null @@ -1,21 +0,0 @@ -#define ELPA2_REAL_KERNEL_GENERIC 1 -#define ELPA2_REAL_KERNEL_GENERIC_SIMPLE 2 -#define ELPA2_REAL_KERNEL_BGP 3 -#define ELPA2_REAL_KERNEL_BGQ 4 -#define ELPA2_REAL_KERNEL_SSE 5 -#define ELPA2_REAL_KERNEL_AVX_BLOCK2 6 -#define ELPA2_REAL_KERNEL_AVX_BLOCK4 7 -#define ELPA2_REAL_KERNEL_AVX_BLOCK6 8 - -#define ELPA2_NUMBER_OF_REAL_KERNELS 8 - - -#define ELPA2_COMPLEX_KERNEL_GENERIC 1 -#define ELPA2_COMPLEX_KERNEL_GENERIC_SIMPLE 2 -#define ELPA2_COMPLEX_KERNEL_BGP 3 -#define ELPA2_COMPLEX_KERNEL_BGQ 4 -#define ELPA2_COMPLEX_KERNEL_SSE 5 -#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6 -#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7 - -#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 7 diff --git a/fdep/LICENSE b/fdep/LICENSE deleted file mode 100644 index 0c660e91b..000000000 --- a/fdep/LICENSE +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (c) 2013 Lorenz Hüdepohl - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this 
permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. diff --git a/fdep/README b/fdep/README deleted file mode 100644 index eb6e3186a..000000000 --- a/fdep/README +++ /dev/null @@ -1,99 +0,0 @@ -fdep ----- - -fdep is a small set of scripts to teach autoconf/automake (using GNU make) -about the additional dependencies in Fortran 90 files due to modules. - -With this, Fortran files can be listed in any order in Makefile.am and parallel -builds work. - - -Usage ------ - - Put this project as a directory "fdep" in your source code, place the two - lines - - m4_include([fdep/fortran_dependencies.m4]) - FDEP_F90_GNU_MAKE_DEPS - - in your configure.ac, and add a single line - - @FORTRAN_MODULE_DEPS@ - - in your Makefile.am. All .F90 files of all programs in bin_PROGRAMS and all - libraries in lib_LTLIBRARIES will now be scanned for modules and the - resulting dependencies will be honoured. - - -What is the problem with Fortran 90 modules and make dependencies? ------------------------------------------------------------------- - - In Fortran 90 source files one can define any number of "modules", containing - variable and function definitions. The names of the modules defined in a file - can be arbitrary. - - In another source file these modules can be used, informing the Fortran - compiler about the definitions in these modules (e.g. to do type-checking). - This creates a problem, as the compiler has to know somehow where the module - is defined. 
- - The usual solution employed by almost every Fortran compiler is to create - special "module" files for each module contained in a source file during - compilation. Their file name is derived by a compiler-specific recipe of the - modules identifier (usually the lower-cased module's identifier plus ".mod", - so "foo_module.mod" and "some_other_module.mod"). When the compiler - encounters a "use" statement during the compilation of another file, it - confers to this file to import the definitions of the module. - - That means, you cannot compile files using modules defined in yet un-compiled - files, one has to tell make about this dependency. - - (A primitive solution to this problem is listing the file in a pre-sorted - order, so that files defining modules are compiled first. - - However, that way the dependency-graph make knows about is incomplete and - parallel builds will fail with a high probability) - - -How does fdep solve this problem technically? ---------------------------------------------- - - As the name of the module files can be an arbitrary (and some compilers might - even save the module definitions in some completely different way), fdep - tells make about the module dependencies as a relation directly between - object files, e.g. when a file 'b.f90' is using any module of file 'a.f90', - fdep adds a dependency of - - b.o: a.o - - - More specifically, the perl-script fortran_dependencies.pl is run by make to - create a file .fortran_dependencies/dependencies.mk, which is then included. - To do this, first every source file (for every defined program and library) - is scanned for lines with "module" or "use" statements. These are saved in - two additional files (.use_mods and .def_mods) per source file and contain - lists of defined and required modules. The perl script then reads these in - and produces the appropriate rules. - - -Drawbacks ---------- - - GNU make is required. 
The detailed dependency graph due to "module" and "use" - statements is only available after pre-processing, when autoconf and even - configure is long over. To still get proper dependencies, fdep uses GNU - make's feature to include generated sub-Makefiles during a running make - invocation. - - -License -------- - - fdep is released under the MIT License. See the LICENSE file for details. - - -Contributing ------------- - - Send your patches or pull-request to dev@stellardeath.org diff --git a/fdep/fortran_dependencies.m4 b/fdep/fortran_dependencies.m4 deleted file mode 100644 index 1e3beb8f8..000000000 --- a/fdep/fortran_dependencies.m4 +++ /dev/null @@ -1,24 +0,0 @@ -dnl Copyright 2015 Lorenz Hüdepohl -dnl -dnl This file is part of fdep and licensed under the MIT license -dnl see the file LICENSE for more information -dnl -AC_DEFUN([FDEP_F90_GNU_MAKE_DEPS],[ -AC_MSG_CHECKING([for GNU make]) -for a in "$MAKE" make gmake gnumake ; do - if test -z "$a" ; then continue ; fi ; - if ( sh -c "$a --version" 2> /dev/null | grep GNU 2>&1 > /dev/null ) ; then - _fdep_gnu_make_command=$a ; - break; - fi -done ; -AC_MSG_RESULT([$_fdep_gnu_make_command]) -if test x$_fdep_gnu_make_command = x ; then - AC_MSG_ERROR([Need GNU Make]) -fi -AC_SUBST([FORTRAN_MODULE_DEPS], [" -CLEANFILES += -include ${srcdir}/fdep/fortran_dependencies.mk -"]) -AM_SUBST_NOTMAKE([FORTRAN_MODULE_DEPS]) -]) diff --git a/fdep/fortran_dependencies.mk b/fdep/fortran_dependencies.mk deleted file mode 100644 index b77292bba..000000000 --- a/fdep/fortran_dependencies.mk +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright 2015 Lorenz Hüdepohl -# -# This file is part of fdep and licensed under the MIT license -# see the file LICENSE for more information -# - -define translate_name -$(subst -,_,$(subst .,_,$1)) -endef - -_f90_verbose = $(_f90_verbose_$(V)) -_f90_verbose_ = $(_f90_verbose_$(AM_DEFAULT_VERBOSITY)) -_f90_verbose_0 = @echo " $1"; -_f90_targets = $(call translate_name,$(PROGRAMS) $(LTLIBRARIES)) - 
-FORTRAN_CPP ?= cpp -P -traditional -Wall -Werror - -# $1 source files -# -# returns: file without any .F90 .f90 .F .f extension -define strip_fortran_ext -$(patsubst %.F90,%,$(patsubst %.f90,%,$(patsubst %.F,%,$(patsubst %.f,%,$1)))) -endef - -# $1 program -# -# returns: -# '1' if object files for target $1 are prefixed due to 'per-target' flags, -# '' (the empty string) otherwise. See the automake manual for 'per-target' -# compilation -# -define is_per_target -$(if $(filter $(call strip_fortran_ext,$(firstword $(call fortran_sources,$1))),$(patsubst %.o,%,$(patsubst %.lo,%,$($1_OBJECTS)))),,1) -endef - -# $1 top-level target name (i.e. an entry of _f90_targets) -# -# returns: all target source files matching *.F90 *.f90 *.F *.f -define fortran_sources -$(filter %.F90 %.f90 %.F %.f,$($1_SOURCES)) -endef - -# $1 top-level target name -# -# returns: the appropriate extension (i.e. 'o' for normal programs, '.lo' for libraries) -define object_extension -$(if $(filter $1,$(PROGRAMS)),o,lo) -endef - -# $1 source file -# $2 stem -# $3 program -# $4 kind of file ('use' or 'def') -define modinfo_name -$(dir $1)$(2)$(call strip_fortran_ext,$(notdir $1)).$4_mods_$(patsubst .,_,$3).$(call object_extension,$3) -endef - -# $1 source_file -# $2 stem -# $3 program -define module_targets -$(eval _$(3)_use_mods += $(call modinfo_name,$1,$2,$3,use)) -$(call modinfo_name,$1,$2,$3,use): $1 $(dir $1)$(am__dirstamp) - $(call _f90_verbose,F90 USE [$3] $$<)$(FORTRAN_CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $($p_CPPFLAGS) $(CPPFLAGS) -o /dev/stdout $$< | \ - grep -i -o '^ *use [^ ,!:]*' | sed 's/^[[:space:]]*//;' | tr '[:upper:]' '[:lower:]' | sort -u > $$@ - -$(eval _$(3)_def_mods += $(call modinfo_name,$1,$2,$3,def)) -$(call modinfo_name,$1,$2,$3,def): $1 $(dir $1)$(am__dirstamp) - $(call _f90_verbose,F90 MOD [$3] $$<)$(FORTRAN_CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $($p_CPPFLAGS) $(CPPFLAGS) -o /dev/stdout $$< | \ - grep -i -o '^ *module [^!]*' | sed 's/^[[:space:]]*//;' | tr 
'[:upper:]' '[:lower:]' | grep -v "\<procedure\>\|\<intrinsic\>" > $$@ || true - -endef -$(foreach p,$(_f90_targets),$(if $(call is_per_target,$p),$(foreach s,$(call fortran_sources,$p),$(eval $(call module_targets,$s,$p-,$p))),$(foreach s,$(call fortran_sources,$p),$(eval $(call module_targets,$s,,$p))))) - -_f90_depdir=$(abs_builddir)/.fortran_dependencies -_f90_depfile = $(_f90_depdir)/dependencies.mk - -# $1 target-name -define recursive_lib_deps -$(foreach l,$(call translate_name,$($1_LDADD) $($1_LIBADD)),$l $(call recursive_lib_deps,$l)) -endef - -define is_clean -$(if $(filter-out mostlyclean clean distclean maintainer-clean,$(MAKECMDGOALS)),0,1) -endef - -ifneq ($(call is_clean),1) -include $(_f90_depfile) -endif -$(_f90_depfile): $(top_srcdir)/fdep/fortran_dependencies.pl $(foreach p,$(_f90_targets),$(_$p_use_mods) $(_$p_def_mods)) | $(foreach p,$(_f90_targets),$(_f90_depdir)/$p) - $(call _f90_verbose,F90 DEPS $@)echo > $@; $(foreach p,$(_f90_targets),$(top_srcdir)/fdep/fortran_dependencies.pl $p $(_$p_use_mods) $(_$p_def_mods) $(foreach l,$(call recursive_lib_deps,$p),$(_$l_use_mods) $(_$l_def_mods)) >> $@; ) - -$(_f90_depdir): - @mkdir $@ - -$(foreach p,$(_f90_targets),$(_f90_depdir)/$p): | $(_f90_depdir) - @mkdir $@ - -CLEANFILES += $(foreach p,$(_f90_targets),$(_$p_def_mods) $(_$p_use_mods)) -CLEANFILES += $(foreach p,$(_f90_targets),$(_f90_depdir)/$p/*) -CLEANFILES += $(_f90_depfile) diff --git a/fdep/fortran_dependencies.pl b/fdep/fortran_dependencies.pl deleted file mode 100755 index 1dc20610d..000000000 --- a/fdep/fortran_dependencies.pl +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/perl -w -# -# Copyright 2015 Lorenz Hüdepohl -# -# This file is part of fdep and licensed under the MIT license -# see the file LICENSE for more information -# - -use strict; - -my %defs = (); -my %uses = (); -my %files = (); - -my $use_re = qr/^\s*use\s+(\S+)\s*$/; -my $def_re = qr/^\s*module\s+(\S+)\s*$/; - -sub add_use { - my ($file, $module) = @_; - if 
(defined($defs{$module}) && $defs{$module} eq $file) { - # do not add self-dependencies - return; - } - if (!defined($uses{$file})) { - $uses{$file} = { $module => 1 }; - } else { - $uses{$file}{$module} = 1; - } -} - -sub add_def { - my ($file, $module) = @_; - if (!defined($defs{$module})) { - $defs{$module} = $file; - if (defined($uses{$file}) && defined($uses{$file}{$module})) { - delete $uses{$file}{$module}; - } - } else { - die "Module $module both defined in $file, $defs{$module}"; - } -} - -my $target = shift; - -foreach my $file (@ARGV) { - if (exists $files{$file}) { - next; - } else { - $files{$file} = 1; - } - my $re; - my $add; - my $object; - if (defined($ENV{V}) && $ENV{V} ge "2") { - print STDERR "fdep: Considering file $file for target $target\n"; - } - if ($file =~ /^(.*)\.def_mods_[^.]*(\..*)$/) { - $re = $def_re; - $add = \&add_def; - $object = $1 . $2; - } elsif ($file =~ /^(.*)\.use_mods_[^.]*(\..*)$/) { - $re = $use_re; - $add = \&add_use; - $object = $1 . $2; - } else { - die "Unrecognized file extension for '$file'"; - } - open(FILE,"<",$file) || die "\nCan't open $file: $!\n\n"; - while(<FILE>) { - chomp; - $_ = lc($_); - if ($_ =~ $re) { - &$add($object, $1); - } else { - die "At $file:$.\nCannot parse module statement '$_', was expecting $re"; - } - } - close(FILE) -} - -foreach my $object (sort keys %uses) { - for my $m (keys %{$uses{$object}}) { - if (defined $defs{$m}) { - print "$object: ", $defs{$m}, "\n"; - } elsif (defined($ENV{V}) && $ENV{V} ge "1") { - print STDERR "Warning: Cannot find definition of module $m in files for current target $target, might be external\n"; - } - } -} diff --git a/fdep/test_project/.gitignore b/fdep/test_project/.gitignore deleted file mode 100644 index ec595a8d2..000000000 --- a/fdep/test_project/.gitignore +++ /dev/null @@ -1,38 +0,0 @@ -*.def_mods -*.la -*.lo -*.o -*.use_mods -*~ -.deps -.dirstamp -.fortran_dependencies -.libs -Makefile -Makefile.in -aclocal.m4 -ar-lib -autom4te.cache -bar 
-compile -config.guess -config.h -config.h.in -config.log -config.status -config.sub -configure -foo -fortran_mod_files -install-sh -libtool -ltmain.sh -m4/libtool.m4 -m4/ltoptions.m4 -m4/ltsugar.m4 -m4/ltversion.m4 -m4/lt~obsolete.m4 -missing -stamp-h1 -test_bar -test_baz diff --git a/fdep/test_project/Makefile.am b/fdep/test_project/Makefile.am deleted file mode 100644 index 061b5fb0d..000000000 --- a/fdep/test_project/Makefile.am +++ /dev/null @@ -1,28 +0,0 @@ -ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4 -AM_CFLAGS = @AM_CFLAGS@ -AM_LDFLAGS = @AM_LDFLAGS@ - -# programs -bin_PROGRAMS = foo test_bar test_baz - -foo_SOURCES = src/foo.F90 src2/baz.F90 src/bar.F90 -foo_CPPFLAGS = -DPROGRAM_foo -foo_FCFLAGS = $(FC_MODOUT)./fortran_mod_files/foo $(FC_MODINC)./fortran_mod_files/foo - -test_bar_SOURCES = src/bar.F90 -test_bar_CPPFLAGS = -DPROGRAM_test_bar -test_bar_FCFLAGS = $(FC_MODOUT)./fortran_mod_files/test_bar $(FC_MODINC)./fortran_mod_files/test_bar - -test_baz_SOURCES = src2/baz.F90 src/bar.F90 -test_baz_CPPFLAGS = -DPROGRAM_test_baz -test_baz_FCFLAGS = $(FC_MODOUT)./fortran_mod_files/test_baz $(FC_MODINC)./fortran_mod_files/test_baz - -# a library -lib_LTLIBRARIES = libdings-2.la - -libdings_2_la_SOURCES = src2/baz.F90 src/bar.F90 -libdings_2_la_FCFLAGS = $(FC_MODOUT)./fortran_mod_files/libdings-2.la $(FC_MODINC)./fortran_mod_files/libdings-2.la - -CLEANFILES = fortran_mod_files/*/* - -@FORTRAN_MODULE_DEPS@ diff --git a/fdep/test_project/autogen.sh b/fdep/test_project/autogen.sh deleted file mode 100755 index c62236285..000000000 --- a/fdep/test_project/autogen.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh -e - -mkdir -p m4/ - -test -n "$srcdir" || srcdir=`dirname "$0"` -test -n "$srcdir" || srcdir=. 
- -autoreconf --force --install --verbose "$srcdir" diff --git a/fdep/test_project/configure.ac b/fdep/test_project/configure.ac deleted file mode 100644 index 316fd1759..000000000 --- a/fdep/test_project/configure.ac +++ /dev/null @@ -1,37 +0,0 @@ -AC_INIT([fdep_test], [0.1]) -AC_PREREQ([2.59]) -AM_INIT_AUTOMAKE([foreign -Wall subdir-objects]) - -AC_CONFIG_MACRO_DIR([m4]) -AC_CONFIG_HEADERS([config.h]) -AM_SILENT_RULES([yes]) - -# gnu-make fortran module dependencies -m4_include([fdep/fortran_dependencies.m4]) -FDEP_F90_GNU_MAKE_DEPS - -# necessary tools -AC_PROG_FC -AC_PROG_INSTALL -AM_PROG_CC_C_O -AM_PROG_AR - -# fortran stuff -AC_FC_MODULE_FLAG -AC_FC_MODULE_OUTPUT_FLAG -AC_FC_FREEFORM - -# libtool -LT_INIT - -AC_SUBST([AM_CFLAGS]) -AC_SUBST([AM_LDFLAGS]) -AC_SUBST([FC_MODINC]) -AC_SUBST([FC_MODOUT]) - -AC_CONFIG_FILES([Makefile]) -AC_OUTPUT - -for target in foo test_bar test_baz libdings-2.la; do - mkdir -p fortran_mod_files/$target -done diff --git a/fdep/test_project/fdep b/fdep/test_project/fdep deleted file mode 120000 index a96aa0ea9..000000000 --- a/fdep/test_project/fdep +++ /dev/null @@ -1 +0,0 @@ -.. 
\ No newline at end of file diff --git a/fdep/test_project/src/bar.F90 b/fdep/test_project/src/bar.F90 deleted file mode 100644 index 2443a5d39..000000000 --- a/fdep/test_project/src/bar.F90 +++ /dev/null @@ -1,17 +0,0 @@ -module bar - implicit none - contains - function two() result(t) - integer :: t - t = 2 - end function -end module - -#ifdef PROGRAM_test_bar -program test_bar - use bar, only : two - if (two() /= 2) then - stop 1 - endif -end program -#endif diff --git a/fdep/test_project/src/foo.F90 b/fdep/test_project/src/foo.F90 deleted file mode 100644 index 504f6dc0a..000000000 --- a/fdep/test_project/src/foo.F90 +++ /dev/null @@ -1,5 +0,0 @@ -program foo - use bar - use baz - write(*,*) "Nop" -end program diff --git a/fdep/test_project/src2/baz.F90 b/fdep/test_project/src2/baz.F90 deleted file mode 100644 index 1506857bb..000000000 --- a/fdep/test_project/src2/baz.F90 +++ /dev/null @@ -1,18 +0,0 @@ -module baz - use bar - implicit none - contains - function two_times_two() result(t) - integer :: t - t = 2 * two() - end function -end module - -#ifdef PROGRAM_test_baz -program test_bar - use baz - if (two_times_two() /= 4) then - stop 1 - endif -end program -#endif diff --git a/m4/ax_check_gnu_make.m4 b/m4/ax_check_gnu_make.m4 deleted file mode 100644 index 938aad71f..000000000 --- a/m4/ax_check_gnu_make.m4 +++ /dev/null @@ -1,78 +0,0 @@ -# =========================================================================== -# http://www.gnu.org/software/autoconf-archive/ax_check_gnu_make.html -# =========================================================================== -# -# SYNOPSIS -# -# AX_CHECK_GNU_MAKE() -# -# DESCRIPTION -# -# This macro searches for a GNU version of make. If a match is found, the -# makefile variable `ifGNUmake' is set to the empty string, otherwise it -# is set to "#". This is useful for including a special features in a -# Makefile, which cannot be handled by other versions of make. 
The -# variable _cv_gnu_make_command is set to the command to invoke GNU make -# if it exists, the empty string otherwise. -# -# Here is an example of its use: -# -# Makefile.in might contain: -# -# # A failsafe way of putting a dependency rule into a makefile -# $(DEPEND): -# $(CC) -MM $(srcdir)/*.c > $(DEPEND) -# -# @ifGNUmake@ ifeq ($(DEPEND),$(wildcard $(DEPEND))) -# @ifGNUmake@ include $(DEPEND) -# @ifGNUmake@ endif -# -# Then configure.in would normally contain: -# -# AX_CHECK_GNU_MAKE() -# AC_OUTPUT(Makefile) -# -# Then perhaps to cause gnu make to override any other make, we could do -# something like this (note that GNU make always looks for GNUmakefile -# first): -# -# if ! test x$_cv_gnu_make_command = x ; then -# mv Makefile GNUmakefile -# echo .DEFAULT: > Makefile ; -# echo \ $_cv_gnu_make_command \$@ >> Makefile; -# fi -# -# Then, if any (well almost any) other make is called, and GNU make also -# exists, then the other make wraps the GNU make. -# -# LICENSE -# -# Copyright (c) 2008 John Darrington <j.darrington@elvis.murdoch.edu.au> -# -# Copying and distribution of this file, with or without modification, are -# permitted in any medium without royalty provided the copyright notice -# and this notice are preserved. This file is offered as-is, without any -# warranty. 
- -#serial 7 - -AC_DEFUN([AX_CHECK_GNU_MAKE], [ AC_CACHE_CHECK( for GNU make,_cv_gnu_make_command, - _cv_gnu_make_command='' ; -dnl Search all the common names for GNU make - for a in "$MAKE" make gmake gnumake ; do - if test -z "$a" ; then continue ; fi ; - if ( sh -c "$a --version" 2> /dev/null | grep GNU 2>&1 > /dev/null ) ; then - _cv_gnu_make_command=$a ; - break; - fi - done ; - ) ; -dnl If there was a GNU version, then set @ifGNUmake@ to the empty string, '#' otherwise - if test "x$_cv_gnu_make_command" != "x" ; then - ifGNUmake='' ; - else - ifGNUmake='#' ; - AC_MSG_RESULT("Not found"); - fi - AC_SUBST(ifGNUmake) -] ) diff --git a/m4/ax_elpa_gpu_version_only.m4 b/m4/ax_elpa_gpu_version_only.m4 deleted file mode 100644 index d974babe0..000000000 --- a/m4/ax_elpa_gpu_version_only.m4 +++ /dev/null @@ -1,48 +0,0 @@ - -dnl macro for testing whether the user wanted to compile only with the GPU version - -dnl usage: DEFINE_OPTION([gpu-support-only],[gpu-support],[with_gpu_support],[install_gpu]) - -AC_DEFUN([DEFINE_OPTION_GPU_SUPPORT_ONLY],[ - AC_ARG_WITH([$1], - AS_HELP_STRING([--with-$1], - [only compile $2 ]), - [with_option=yes],[with_option=no]) - - if test x"${with_option}" = x"yes" ; then - dnl make sure that all the other kernels are unset - install_real_generic=no - install_real_generic_simple=no - install_real_sse=no - install_real_bgp=no - install_real_bgq=no - install_real_avx_block2=no - install_real_avx_block4=no - install_real_avx_block6=no - - - install_complex_generic=no - install_complex_generic_simple=no - install_complex_sse=no - install_complex_bgp=no - install_complex_bgq=no - install_complex_avx_block1=no - install_complex_avx_block2=no - - - install_gpu=yes - - want_avx=no - - build_with_gpu_support_only=yes - use_specific_complex_kernel=yes - use_specific_real_kernel=yes - dnl now set the specific kernel - $3=yes - - AC_MSG_NOTICE([ELPA will be build only with $1]) - else - build_with_gpu_support_only=no - fi -]) - diff --git 
a/m4/ax_elpa_openmp.m4 b/m4/ax_elpa_openmp.m4 deleted file mode 100644 index 52b19ddc7..000000000 --- a/m4/ax_elpa_openmp.m4 +++ /dev/null @@ -1,98 +0,0 @@ -# openmp.m4 serial 4 -dnl Copyright (C) 2006-2007 Free Software Foundation, Inc. -dnl This file is free software; the Free Software Foundation -dnl gives unlimited permission to copy and/or distribute it, -dnl with or without modifications, as long as this notice is preserved. - -dnl This file can be removed once we assume autoconf >= 2.62. - -# _AX_ELPA_LANG_OPENMP -# --------------- -# Expands to some language dependent source code for testing the presence of -# OpenMP. -AC_DEFUN([_AX_ELPA_LANG_OPENMP], -[_AC_LANG_DISPATCH([$0], _AC_LANG, $@)]) - -# _AC_LANG_OPENMP(C) -# ------------------ -m4_define([_AX_ELPA_LANG_OPENMP(C)], -[ -#ifndef _OPENMP - choke me -#endif -#include <omp.h> -int main () { return omp_get_num_threads (); } -]) - -# _AX_ELPA_LANG_OPENMP(C++) -# -------------------- -m4_copy([_AX_ELPA_LANG_OPENMP(C)], [_AX_ELPA_LANG_OPENMP(C++)]) - -# _AX_ELPA_LANG_OPENMP(Fortran 77) -# --------------------------- -m4_define([_AX_ELPA_LANG_OPENMP(Fortran 77)], -[ - program test_openmp - use omp_lib - implicit none -!$ integer :: foobar - foobar = omp_get_num_threads() - end program -]) - -# _AX_ELPA_LANG_OPENMP(Fortran) -# --------------------------- -m4_copy([_AX_ELPA_LANG_OPENMP(Fortran 77)], [_AX_ELPA_LANG_OPENMP(Fortran)]) - -# AC_ELPPA_OPENMP -# --------- -# Check which options need to be passed to the C compiler to support OpenMP. -# Set the OPENMP_CFLAGS / OPENMP_CXXFLAGS / OPENMP_FFLAGS variable to these -# options. -# The options are necessary at compile time (so the #pragmas are understood) -# and at link time (so the appropriate library is linked with). -# This macro takes care to not produce redundant options if $CC $CFLAGS already -# supports OpenMP. 
It also is careful to not pass options to compilers that -# misinterpret them; for example, most compilers accept "-openmp" and create -# an output file called 'penmp' rather than activating OpenMP support. -AC_DEFUN([AX_ELPA_OPENMP], -[ - OPENMP_[]_AC_LANG_PREFIX[]FLAGS= - enable_openmp="yes" - if test "$enable_openmp" != no; then - AC_CACHE_CHECK([for _AC_LANG_ABBREV option to support OpenMP], - [ac_cv_prog_[]_AC_LANG_ABBREV[]_openmp], - [AC_LINK_IFELSE([AC_LANG_SOURCE([_AX_ELPA_LANG_OPENMP])], - [ac_cv_prog_[]_AC_LANG_ABBREV[]_openmp='none needed'], - [ac_cv_prog_[]_AC_LANG_ABBREV[]_openmp='unsupported' - dnl Try these flags: - dnl GCC >= 4.2 -fopenmp - dnl SunPRO C -xopenmp - dnl Intel C -openmp - dnl SGI C, PGI C -mp - dnl Tru64 Compaq C -omp - dnl IBM C (AIX, Linux) -qsmp=omp - dnl If in this loop a compiler is passed an option that it doesn't - dnl understand or that it misinterprets, the AC_LINK_IFELSE test - dnl will fail (since we know that it failed without the option), - dnl therefore the loop will continue searching for an option, and - dnl no output file called 'penmp' or 'mp' is created. 
- for ac_option in -openmp -fopenmp -xopenmp -mp -omp -qsmp=omp; do - ac_save_[]_AC_LANG_PREFIX[]FLAGS=$[]_AC_LANG_PREFIX[]FLAGS - _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $ac_option" - AC_LINK_IFELSE([AC_LANG_SOURCE([_AX_ELPA_LANG_OPENMP])], - [ac_cv_prog_[]_AC_LANG_ABBREV[]_openmp=$ac_option]) - _AC_LANG_PREFIX[]FLAGS=$ac_save_[]_AC_LANG_PREFIX[]FLAGS - if test "$ac_cv_prog_[]_AC_LANG_ABBREV[]_openmp" != unsupported; then - break - fi - done])]) - case $ac_cv_prog_[]_AC_LANG_ABBREV[]_openmp in #( - "none needed" | unsupported) - ;; #( - *) - OPENMP_[]_AC_LANG_PREFIX[]FLAGS=$ac_cv_prog_[]_AC_LANG_ABBREV[]_openmp ;; - esac - fi - AC_SUBST([OPENMP_]_AC_LANG_PREFIX[FLAGS]) -]) diff --git a/m4/ax_elpa_specific_kernels.m4 b/m4/ax_elpa_specific_kernels.m4 deleted file mode 100644 index 556de2589..000000000 --- a/m4/ax_elpa_specific_kernels.m4 +++ /dev/null @@ -1,132 +0,0 @@ - -dnl macro for testing whether the user wanted to compile only with one -dnl specific real kernel - -dnl usage: DEFINE_OPTION([real-generic-kernel-only],[generic-kernel],[with_real_generic_kernel],[install_real_generic]) - -AC_DEFUN([DEFINE_OPTION_SPECIFIC_REAL_KERNEL],[ - AC_ARG_WITH([$1], - AS_HELP_STRING([--with-$1], - [only compile $2 for real case]), - [with_option=yes],[with_option=no]) - - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_real_kernel}" = x"no" ; then - - dnl make sure that all the other kernels are unset - install_real_generic=no - install_real_generic_simple=no - install_real_sse=no - install_real_bgp=no - install_real_bgq=no - install_real_avx_block2=no - install_real_avx_block4=no - install_real_avx_block6=no - want_avx=no - install_gpu=no - - use_specific_real_kernel=yes - dnl now set the specific kernel - $3=yes - - dnl in case of SSE or AVX make sure that we can compile the choosen kernel - if test x"${install_real_sse}" = x"yes" ; then - if test x"${can_compile_sse}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be 
compiled!]) - fi - fi - - if test x"${install_real_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - else - want_avx=yes - fi - fi - - if test x"${install_real_avx_block4}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - else - want_avx=yes - fi - fi - - if test x"${install_real_avx_block6}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - else - want_avx=yes - fi - fi - - AC_MSG_NOTICE([$1 will be the only compiled kernel for real case]) - if test x"${want_gpu}" = x"yes" ; then - AC_MSG_WARN([At the moment this disables GPU support!]) - AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific real kernel]) - fi - else - AC_MSG_FAILURE([$1 failed; A specific kernel for real case has already been defined before!]) - fi - fi -]) - - -AC_DEFUN([DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL],[ - AC_ARG_WITH([$1], - AS_HELP_STRING([--with-$1], - [only compile $2 for complex case]), - [with_option=yes],[with_option=no]) - - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_complex_kernel}" = x"no" ; then - - dnl make sure that all the other kernels are unset - install_complex_generic=no - install_complex_generic_simple=no - install_complex_sse=no - install_complex_bgp=no - install_complex_bgq=no - install_complex_avx_block1=no - install_complex_avx_block2=no - want_avx=no - - install_gpu=no - use_specific_complex_kernel=yes - dnl now set the specific kernel - $3=yes - - dnl in case of SSE or AVX make sure that we can compile the choosen kernel - if test x"${install_complex_sse}" = x"yes" ; then - if test x"${can_compile_sse}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - fi - fi - - if test x"${install_complex_avx_block1}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; 
then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - else - want_avx=yes - fi - fi - - if test x"${install_complex_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - else - want_avx=yes - fi - fi - - AC_MSG_NOTICE([$1 will be the only compiled kernel for real case]) - if test x"${want_gpu}" = x"yes" ; then - AC_MSG_WARN([At the moment this disables GPU support!]) - AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific complex kernel]) - fi - else - AC_MSG_FAILURE([$1 failed; A specific kernel for real case has already been defined before!]) - fi - fi -]) - diff --git a/m4/ax_prog_cc_mpi.m4 b/m4/ax_prog_cc_mpi.m4 deleted file mode 100644 index be26d70a3..000000000 --- a/m4/ax_prog_cc_mpi.m4 +++ /dev/null @@ -1,166 +0,0 @@ -# =========================================================================== -# http://www.gnu.org/software/autoconf-archive/ax_prog_cc_mpi.html -# =========================================================================== -# -# SYNOPSIS -# -# AX_PROG_CC_MPI([MPI-WANTED-TEST[, ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]]) -# -# DESCRIPTION -# -# This macro tries to find out how to compile C programs that use MPI -# (Message Passing Interface), a standard API for parallel process -# communication (see http://www-unix.mcs.anl.gov/mpi/). The macro has to -# be used instead of the standard macro AC_PROG_CC and will replace the -# standard variable CC with the found compiler. -# -# MPI-WANTED-TEST is used to test whether MPI is actually wanted by the -# user. If MPI-WANTED_TEST is omitted or if it succeeds, the macro will -# try to find out how to use MPI, if it fails, the macro will call -# AC_PROG_CC to find a standard C compiler instead. -# -# When MPI is found, ACTION-IF-FOUND will be executed, if MPI is not found -# (or MPI-WANTED-TEST fails) ACTION-IF-NOT-FOUND is executed. 
If -# ACTION-IF-FOUND is not set, the macro will define HAVE_MPI. -# -# The following example demonstrates usage of the macro: -# -# # If --with-mpi=auto is used, try to find MPI, but use standard C compiler if it is not found. -# # If --with-mpi=yes is used, try to find MPI and fail if it isn't found. -# # If --with-mpi=no is used, use a standard C compiler instead. -# AC_ARG_WITH(mpi, [AS_HELP_STRING([--with-mpi], -# [compile with MPI (parallelization) support. If none is found, -# MPI is not used. Default: auto]) -# ],,[with_mpi=auto]) -# # -# AX_PROG_CC_MPI([test x"$with_mpi" != xno],[use_mpi=yes],[ -# use_mpi=no -# if test x"$with_mpi" = xyes; then -# AC_MSG_FAILURE([MPI compiler requested, but couldn't use MPI.]) -# else -# AC_MSG_WARN([No MPI compiler found, won't use MPI.]) -# fi -# ]) -# -# LICENSE -# -# Copyright (c) 2010,2011 Olaf Lenz <olenz@icp.uni-stuttgart.de> -# -# This program is free software: you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation, either version 3 of the License, or (at your -# option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General -# Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program. If not, see <http://www.gnu.org/licenses/>. -# -# As a special exception, the respective Autoconf Macro's copyright owner -# gives unlimited permission to copy, distribute and modify the configure -# scripts that are the output of Autoconf when processing the Macro. You -# need not follow the terms of the GNU General Public License when using -# or distributing such scripts, even though portions of the text of the -# Macro appear in them. 
The GNU General Public License (GPL) does govern -# all other use of the material that constitutes the Autoconf Macro. -# -# This special exception to the GPL applies to versions of the Autoconf -# Macro released by the Autoconf Archive. When you make and distribute a -# modified version of the Autoconf Macro, you may extend this special -# exception to the GPL to apply to your modified version as well. - -#serial 1 - -AC_DEFUN([AX_PROG_CC_MPI], [ -AC_PREREQ(2.50) - -# Check for compiler -# Needs to be split off into an extra macro to ensure right expansion -# order. -AC_REQUIRE([_AX_PROG_CC_MPI],[_AX_PROG_CC_MPI([$1])]) - -AS_IF([test x"$_ax_prog_cc_mpi_mpi_wanted" = xno], - [ _ax_prog_cc_mpi_mpi_found=no ], - [ - AC_LANG_PUSH([C]) - # test whether MPI_Init is available - # We do not use AC_SEARCH_LIBS here, as it caches its outcome and - # thus disallows corresponding calls in the other AX_PROG_*_MPI - # macros. - for lib in NONE mpi mpich; do - save_LIBS=$LIBS - if test x"$lib" = xNONE; then - AC_MSG_CHECKING([for function MPI_Init]) - else - AC_MSG_CHECKING([for function MPI_Init in -l$lib]) - LIBS="-l$lib $LIBS" - fi - AC_LINK_IFELSE([AC_LANG_CALL([],[MPI_Init])], - [ _ax_prog_cc_mpi_mpi_found=yes ], - [ _ax_prog_cc_mpi_mpi_found=no ]) - AC_MSG_RESULT($_ax_prog_cc_mpi_mpi_found) - if test "x$_ax_prog_cc_mpi_mpi_found" = "xyes"; then - break; - fi - LIBS=$save_LIBS - done - - # Check for header - AS_IF([test x"$_ax_prog_cc_mpi_mpi_found" = xyes], [ - AC_MSG_CHECKING([for mpi.h]) - AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <mpi.h>])], - [ AC_MSG_RESULT(yes)], - [ AC_MSG_RESULT(no) - _ax_prog_cc_mpi_mpi_found=no - ]) - ]) - AC_LANG_POP([C]) -]) - -# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: -AS_IF([test x"$_ax_prog_cc_mpi_mpi_found" = xyes], [ - ifelse([$2],,[AC_DEFINE(HAVE_MPI,1,[Define if you have the MPI library.])],[$2]) - : -],[ - $3 - : -]) - -])dnl AX_PROG_CC_MPI - -dnl _AX_PROG_CC_MPI is an internal macro required by AX_PROG_CC_MPI. 
-dnl To ensure the right expansion order, the main function AX_PROG_CC_MPI -dnl has to be split into two parts. -dnl -dnl Known MPI C compilers: -dnl mpicc -dnl mpixlc_r -dnl mpixlc -dnl hcc -dnl mpxlc_r -dnl mpxlc -dnl sxmpicc NEC SX -dnl mpifcc Fujitsu -dnl mpgcc -dnl mpcc -dnl cmpicc -dnl cc -dnl -AC_DEFUN([_AX_PROG_CC_MPI], [ - ifelse([$1],,[_ax_prog_cc_mpi_mpi_wanted=yes],[ - AC_MSG_CHECKING([whether to compile using MPI]) - if $1; then - _ax_prog_cc_mpi_mpi_wanted=yes - else - _ax_prog_cc_mpi_mpi_wanted=no - fi - AC_MSG_RESULT($_ax_prog_cc_mpi_mpi_wanted) - ]) - if test x"$_ax_prog_cc_mpi_mpi_wanted" = xyes; then - AC_CHECK_TOOLS([CC], [mpicc mpixlc_r mpixlc hcc mpxlc_r mpxlc sxmpicc mpifcc mpgcc mpcc cmpicc cc gcc]) - fi - AC_PROG_CC -])dnl _AX_PROG_CC_MPI diff --git a/m4/ax_prog_cxx_mpi.m4 b/m4/ax_prog_cxx_mpi.m4 deleted file mode 100644 index 988f6222e..000000000 --- a/m4/ax_prog_cxx_mpi.m4 +++ /dev/null @@ -1,173 +0,0 @@ -# =========================================================================== -# http://www.gnu.org/software/autoconf-archive/ax_prog_cxx_mpi.html -# =========================================================================== -# -# SYNOPSIS -# -# AX_PROG_CXX_MPI([MPI-WANTED-TEST[, ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]]) -# -# DESCRIPTION -# -# This macro tries to find out how to compile C++ programs that use MPI -# (Message Passing Interface), a standard API for parallel process -# communication (see http://www-unix.mcs.anl.gov/mpi/). The macro has to -# be used instead of the standard macro AC_PROG_CXX and will replace the -# standard variable CXX with the found compiler. -# -# MPI-WANTED-TEST is used to test whether MPI is actually wanted by the -# user. If MPI-WANTED_TEST is omitted or if it succeeds, the macro will -# try to find out how to use MPI, if it fails, the macro will call -# AC_PROG_CC to find a standard C compiler instead. 
-# -# When MPI is found, ACTION-IF-FOUND will be executed, if MPI is not found -# (or MPI-WANTED-TEST fails) ACTION-IF-NOT-FOUND is executed. If -# ACTION-IF-FOUND is not set, the macro will define HAVE_MPI. -# -# The following example demonstrates usage of the macro: -# -# # If --with-mpi=auto is used, try to find MPI, but use standard C compiler if it is not found. -# # If --with-mpi=yes is used, try to find MPI and fail if it isn't found. -# # If --with-mpi=no is used, use a standard C compiler instead. -# AC_ARG_WITH(mpi, [AS_HELP_STRING([--with-mpi], -# [compile with MPI (parallelization) support. If none is found, -# MPI is not used. Default: auto]) -# ],,[with_mpi=auto]) -# -# AX_PROG_CXX_MPI([test x"$with_mpi" != xno],[use_mpi=yes],[ -# use_mpi=no -# if test x"$with_mpi" = xyes; then -# AC_MSG_FAILURE([MPI compiler requested, but couldn't use MPI.]) -# else -# AC_MSG_WARN([No MPI compiler found, won't use MPI.]) -# fi -# ]) -# -# LICENSE -# -# Copyright (c) 2010,2011 Olaf Lenz <olenz@icp.uni-stuttgart.de> -# -# This program is free software: you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation, either version 3 of the License, or (at your -# option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General -# Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program. If not, see <http://www.gnu.org/licenses/>. -# -# As a special exception, the respective Autoconf Macro's copyright owner -# gives unlimited permission to copy, distribute and modify the configure -# scripts that are the output of Autoconf when processing the Macro. 
You -# need not follow the terms of the GNU General Public License when using -# or distributing such scripts, even though portions of the text of the -# Macro appear in them. The GNU General Public License (GPL) does govern -# all other use of the material that constitutes the Autoconf Macro. -# -# This special exception to the GPL applies to versions of the Autoconf -# Macro released by the Autoconf Archive. When you make and distribute a -# modified version of the Autoconf Macro, you may extend this special -# exception to the GPL to apply to your modified version as well. - -#serial 2 - -AC_DEFUN([AX_PROG_CXX_MPI], [ -AC_PREREQ(2.50) - -# Check for compiler -# Needs to be split off into an extra macro to ensure right expansion -# order. -AC_REQUIRE([_AX_PROG_CXX_MPI],[_AX_PROG_CXX_MPI([$1])]) - -AS_IF([test x"$_ax_prog_cxx_mpi_mpi_wanted" = xno], - [ _ax_prog_cxx_mpi_mpi_found=no ], - [ - AC_LANG_PUSH([C++]) - - # test whether MPI_Init() is available - # We do not use AC_SEARCH_LIBS here, as it caches its outcome and - # thus disallows corresponding calls in the other AX_PROG_*_MPI - # macros. 
- for lib in NONE mpi mpich; do - save_LIBS=$LIBS - if test x"$lib" = xNONE; then - AC_MSG_CHECKING([for function MPI_Init]) - else - AC_MSG_CHECKING([for function MPI_Init in -l$lib]) - LIBS="-l$lib $LIBS" - fi - AC_LINK_IFELSE([ - AC_LANG_PROGRAM([ -extern "C" { void MPI_Init(); } -],[MPI_Init();])], - [ _ax_prog_cxx_mpi_mpi_found=yes ], - [ _ax_prog_cxx_mpi_mpi_found=no ]) - AC_MSG_RESULT($_ax_prog_cxx_mpi_mpi_found) - if test "x$_ax_prog_cxx_mpi_mpi_found" = "xyes"; then - break; - fi - LIBS=$save_LIBS - done - - # Check for header - AS_IF([test x"$_ax_prog_cxx_mpi_mpi_found" = xyes], [ - AC_MSG_CHECKING([for mpi.h]) - AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <mpi.h>])], - [ AC_MSG_RESULT(yes)], - [ AC_MSG_RESULT(no) - _ax_prog_cxx_mpi_mpi_found=no - ]) - ]) - AC_LANG_POP([C++]) -]) - -# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: -AS_IF([test x"$_ax_prog_cxx_mpi_mpi_found" = xyes], [ - ifelse([$2],,[AC_DEFINE(HAVE_MPI,1,[Define if you have the MPI library.])],[$2]) - : -],[ - $3 - : -]) - -])dnl AX_PROG_CXX_MPI - -dnl _AX_PROG_CXX_MPI is an internal macro required by AX_PROG_CXX_MPI. -dnl To ensure the right expansion order, the main function AX_PROG_CXX_MPI -dnl has to be split into two parts. 
-dnl -dnl Known MPI C++ compilers: -dnl mpic++ -dnl mpicxx -dnl mpiCC -dnl sxmpic++ NEC SX -dnl hcp -dnl mpxlC_r -dnl mpxlC -dnl mpixlcxx_r -dnl mpixlcxx -dnl mpg++ -dnl mpc++ -dnl mpCC -dnl cmpic++ -dnl mpiFCC Fujitsu -dnl CC -dnl -AC_DEFUN([_AX_PROG_CXX_MPI], [ - ifelse([$1],,[_ax_prog_cxx_mpi_mpi_wanted=yes],[ - AC_MSG_CHECKING([whether to compile using MPI]) - if $1; then - _ax_prog_cxx_mpi_mpi_wanted=yes - else - _ax_prog_cxx_mpi_mpi_wanted=no - fi - AC_MSG_RESULT($_ax_prog_cxx_mpi_mpi_wanted) - ]) - if test x"$_ax_prog_cxx_mpi_mpi_wanted" = xyes; then - AC_CHECK_TOOLS([CXX], [mpic++ mpicxx mpiCC sxmpic++ hcp mpxlC_r mpxlC mpixlcxx_r mpixlcxx mpg++ mpc++ mpCC cmpic++ mpiFCC CCicpc pgCC pathCC sxc++ xlC_r xlC bgxlC_r bgxlC openCC sunCC crayCC g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC]) - fi - AC_PROG_CXX -])dnl _AX_PROG_CXX_MPI diff --git a/m4/ax_prog_doxygen.m4 b/m4/ax_prog_doxygen.m4 deleted file mode 100644 index 44b22b00a..000000000 --- a/m4/ax_prog_doxygen.m4 +++ /dev/null @@ -1,532 +0,0 @@ -# =========================================================================== -# http://www.gnu.org/software/autoconf-archive/ax_prog_doxygen.html -# =========================================================================== -# -# SYNOPSIS -# -# DX_INIT_DOXYGEN(PROJECT-NAME, DOXYFILE-PATH, [OUTPUT-DIR]) -# DX_DOXYGEN_FEATURE(ON|OFF) -# DX_DOT_FEATURE(ON|OFF) -# DX_HTML_FEATURE(ON|OFF) -# DX_CHM_FEATURE(ON|OFF) -# DX_CHI_FEATURE(ON|OFF) -# DX_MAN_FEATURE(ON|OFF) -# DX_RTF_FEATURE(ON|OFF) -# DX_XML_FEATURE(ON|OFF) -# DX_PDF_FEATURE(ON|OFF) -# DX_PS_FEATURE(ON|OFF) -# -# DESCRIPTION -# -# The DX_*_FEATURE macros control the default setting for the given -# Doxygen feature. Supported features are 'DOXYGEN' itself, 'DOT' for -# generating graphics, 'HTML' for plain HTML, 'CHM' for compressed HTML -# help (for MS users), 'CHI' for generating a seperate .chi file by the -# .chm file, and 'MAN', 'RTF', 'XML', 'PDF' and 'PS' for the appropriate -# output formats. 
The environment variable DOXYGEN_PAPER_SIZE may be -# specified to override the default 'a4wide' paper size. -# -# By default, HTML, PDF and PS documentation is generated as this seems to -# be the most popular and portable combination. MAN pages created by -# Doxygen are usually problematic, though by picking an appropriate subset -# and doing some massaging they might be better than nothing. CHM and RTF -# are specific for MS (note that you can't generate both HTML and CHM at -# the same time). The XML is rather useless unless you apply specialized -# post-processing to it. -# -# The macros mainly control the default state of the feature. The use can -# override the default by specifying --enable or --disable. The macros -# ensure that contradictory flags are not given (e.g., -# --enable-doxygen-html and --enable-doxygen-chm, -# --enable-doxygen-anything with --disable-doxygen, etc.) Finally, each -# feature will be automatically disabled (with a warning) if the required -# programs are missing. -# -# Once all the feature defaults have been specified, call DX_INIT_DOXYGEN -# with the following parameters: a one-word name for the project for use -# as a filename base etc., an optional configuration file name (the -# default is 'Doxyfile', the same as Doxygen's default), and an optional -# output directory name (the default is 'doxygen-doc'). -# -# Automake Support -# -# The following is a template aminclude.am file for use with Automake. -# Make targets and variables values are controlled by the various -# DX_COND_* conditionals set by autoconf. -# -# The provided targets are: -# -# doxygen-doc: Generate all doxygen documentation. -# -# doxygen-run: Run doxygen, which will generate some of the -# documentation (HTML, CHM, CHI, MAN, RTF, XML) -# but will not do the post processing required -# for the rest of it (PS, PDF, and some MAN). -# -# doxygen-man: Rename some doxygen generated man pages. -# -# doxygen-ps: Generate doxygen PostScript documentation. 
-# -# doxygen-pdf: Generate doxygen PDF documentation. -# -# Note that by default these are not integrated into the automake targets. -# If doxygen is used to generate man pages, you can achieve this -# integration by setting man3_MANS to the list of man pages generated and -# then adding the dependency: -# -# $(man3_MANS): doxygen-doc -# -# This will cause make to run doxygen and generate all the documentation. -# -# The following variable is intended for use in Makefile.am: -# -# DX_CLEANFILES = everything to clean. -# -# Then add this variable to MOSTLYCLEANFILES. -# -# ----- begin aminclude.am ------------------------------------- -# -# ## --------------------------------- ## -# ## Format-independent Doxygen rules. ## -# ## --------------------------------- ## -# -# if DX_COND_doc -# -# ## ------------------------------- ## -# ## Rules specific for HTML output. ## -# ## ------------------------------- ## -# -# if DX_COND_html -# -# DX_CLEAN_HTML = @DX_DOCDIR@/html -# -# endif DX_COND_html -# -# ## ------------------------------ ## -# ## Rules specific for CHM output. ## -# ## ------------------------------ ## -# -# if DX_COND_chm -# -# DX_CLEAN_CHM = @DX_DOCDIR@/chm -# -# if DX_COND_chi -# -# DX_CLEAN_CHI = @DX_DOCDIR@/@PACKAGE@.chi -# -# endif DX_COND_chi -# -# endif DX_COND_chm -# -# ## ------------------------------ ## -# ## Rules specific for MAN output. ## -# ## ------------------------------ ## -# -# if DX_COND_man -# -# DX_CLEAN_MAN = @DX_DOCDIR@/man -# -# endif DX_COND_man -# -# ## ------------------------------ ## -# ## Rules specific for RTF output. ## -# ## ------------------------------ ## -# -# if DX_COND_rtf -# -# DX_CLEAN_RTF = @DX_DOCDIR@/rtf -# -# endif DX_COND_rtf -# -# ## ------------------------------ ## -# ## Rules specific for XML output. ## -# ## ------------------------------ ## -# -# if DX_COND_xml -# -# DX_CLEAN_XML = @DX_DOCDIR@/xml -# -# endif DX_COND_xml -# -# ## ----------------------------- ## -# ## Rules specific for PS output. 
## -# ## ----------------------------- ## -# -# if DX_COND_ps -# -# DX_CLEAN_PS = @DX_DOCDIR@/@PACKAGE@.ps -# -# DX_PS_GOAL = doxygen-ps -# -# doxygen-ps: @DX_DOCDIR@/@PACKAGE@.ps -# -# @DX_DOCDIR@/@PACKAGE@.ps: @DX_DOCDIR@/@PACKAGE@.tag -# cd @DX_DOCDIR@/latex; \ -# rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \ -# $(DX_LATEX) refman.tex; \ -# $(MAKEINDEX_PATH) refman.idx; \ -# $(DX_LATEX) refman.tex; \ -# countdown=5; \ -# while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \ -# refman.log > /dev/null 2>&1 \ -# && test $$countdown -gt 0; do \ -# $(DX_LATEX) refman.tex; \ -# countdown=`expr $$countdown - 1`; \ -# done; \ -# $(DX_DVIPS) -o ../@PACKAGE@.ps refman.dvi -# -# endif DX_COND_ps -# -# ## ------------------------------ ## -# ## Rules specific for PDF output. ## -# ## ------------------------------ ## -# -# if DX_COND_pdf -# -# DX_CLEAN_PDF = @DX_DOCDIR@/@PACKAGE@.pdf -# -# DX_PDF_GOAL = doxygen-pdf -# -# doxygen-pdf: @DX_DOCDIR@/@PACKAGE@.pdf -# -# @DX_DOCDIR@/@PACKAGE@.pdf: @DX_DOCDIR@/@PACKAGE@.tag -# cd @DX_DOCDIR@/latex; \ -# rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \ -# $(DX_PDFLATEX) refman.tex; \ -# $(DX_MAKEINDEX) refman.idx; \ -# $(DX_PDFLATEX) refman.tex; \ -# countdown=5; \ -# while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \ -# refman.log > /dev/null 2>&1 \ -# && test $$countdown -gt 0; do \ -# $(DX_PDFLATEX) refman.tex; \ -# countdown=`expr $$countdown - 1`; \ -# done; \ -# mv refman.pdf ../@PACKAGE@.pdf -# -# endif DX_COND_pdf -# -# ## ------------------------------------------------- ## -# ## Rules specific for LaTeX (shared for PS and PDF). 
## -# ## ------------------------------------------------- ## -# -# if DX_COND_latex -# -# DX_CLEAN_LATEX = @DX_DOCDIR@/latex -# -# endif DX_COND_latex -# -# .PHONY: doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL) -# -# .INTERMEDIATE: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL) -# -# doxygen-run: @DX_DOCDIR@/@PACKAGE@.tag -# -# doxygen-doc: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL) -# -# @DX_DOCDIR@/@PACKAGE@.tag: $(DX_CONFIG) $(pkginclude_HEADERS) -# rm -rf @DX_DOCDIR@ -# $(DX_ENV) $(DX_DOXYGEN) $(srcdir)/$(DX_CONFIG) -# -# DX_CLEANFILES = \ -# @DX_DOCDIR@/@PACKAGE@.tag \ -# -r \ -# $(DX_CLEAN_HTML) \ -# $(DX_CLEAN_CHM) \ -# $(DX_CLEAN_CHI) \ -# $(DX_CLEAN_MAN) \ -# $(DX_CLEAN_RTF) \ -# $(DX_CLEAN_XML) \ -# $(DX_CLEAN_PS) \ -# $(DX_CLEAN_PDF) \ -# $(DX_CLEAN_LATEX) -# -# endif DX_COND_doc -# -# ----- end aminclude.am --------------------------------------- -# -# LICENSE -# -# Copyright (c) 2009 Oren Ben-Kiki <oren@ben-kiki.org> -# -# Copying and distribution of this file, with or without modification, are -# permitted in any medium without royalty provided the copyright notice -# and this notice are preserved. This file is offered as-is, without any -# warranty. - -#serial 12 - -## ----------## -## Defaults. ## -## ----------## - -DX_ENV="" -AC_DEFUN([DX_FEATURE_doc], ON) -AC_DEFUN([DX_FEATURE_dot], OFF) -AC_DEFUN([DX_FEATURE_man], OFF) -AC_DEFUN([DX_FEATURE_html], ON) -AC_DEFUN([DX_FEATURE_chm], OFF) -AC_DEFUN([DX_FEATURE_chi], OFF) -AC_DEFUN([DX_FEATURE_rtf], OFF) -AC_DEFUN([DX_FEATURE_xml], OFF) -AC_DEFUN([DX_FEATURE_pdf], ON) -AC_DEFUN([DX_FEATURE_ps], ON) - -## --------------- ## -## Private macros. ## -## --------------- ## - -# DX_ENV_APPEND(VARIABLE, VALUE) -# ------------------------------ -# Append VARIABLE="VALUE" to DX_ENV for invoking doxygen. -AC_DEFUN([DX_ENV_APPEND], [AC_SUBST([DX_ENV], ["$DX_ENV $1='$2'"])]) - -# DX_DIRNAME_EXPR -# --------------- -# Expand into a shell expression prints the directory part of a path. 
-AC_DEFUN([DX_DIRNAME_EXPR], - [[expr ".$1" : '\(\.\)[^/]*$' \| "x$1" : 'x\(.*\)/[^/]*$']]) - -# DX_IF_FEATURE(FEATURE, IF-ON, IF-OFF) -# ------------------------------------- -# Expands according to the M4 (static) status of the feature. -AC_DEFUN([DX_IF_FEATURE], [ifelse(DX_FEATURE_$1, ON, [$2], [$3])]) - -# DX_REQUIRE_PROG(VARIABLE, PROGRAM) -# ---------------------------------- -# Require the specified program to be found for the DX_CURRENT_FEATURE to work. -AC_DEFUN([DX_REQUIRE_PROG], [ -AC_PATH_TOOL([$1], [$2]) -if test "$DX_FLAG_[]DX_CURRENT_FEATURE$$1" = 1; then - AC_MSG_WARN([$2 not found - will not DX_CURRENT_DESCRIPTION]) - AC_SUBST(DX_FLAG_[]DX_CURRENT_FEATURE, 0) -fi -]) - -# DX_TEST_FEATURE(FEATURE) -# ------------------------ -# Expand to a shell expression testing whether the feature is active. -AC_DEFUN([DX_TEST_FEATURE], [test "$DX_FLAG_$1" = 1]) - -# DX_CHECK_DEPEND(REQUIRED_FEATURE, REQUIRED_STATE) -# ------------------------------------------------- -# Verify that a required features has the right state before trying to turn on -# the DX_CURRENT_FEATURE. -AC_DEFUN([DX_CHECK_DEPEND], [ -test "$DX_FLAG_$1" = "$2" \ -|| AC_MSG_ERROR([doxygen-DX_CURRENT_FEATURE ifelse([$2], 1, - requires, contradicts) doxygen-DX_CURRENT_FEATURE]) -]) - -# DX_CLEAR_DEPEND(FEATURE, REQUIRED_FEATURE, REQUIRED_STATE) -# ---------------------------------------------------------- -# Turn off the DX_CURRENT_FEATURE if the required feature is off. -AC_DEFUN([DX_CLEAR_DEPEND], [ -test "$DX_FLAG_$1" = "$2" || AC_SUBST(DX_FLAG_[]DX_CURRENT_FEATURE, 0) -]) - -# DX_FEATURE_ARG(FEATURE, DESCRIPTION, -# CHECK_DEPEND, CLEAR_DEPEND, -# REQUIRE, DO-IF-ON, DO-IF-OFF) -# -------------------------------------------- -# Parse the command-line option controlling a feature. 
CHECK_DEPEND is called -# if the user explicitly turns the feature on (and invokes DX_CHECK_DEPEND), -# otherwise CLEAR_DEPEND is called to turn off the default state if a required -# feature is disabled (using DX_CLEAR_DEPEND). REQUIRE performs additional -# requirement tests (DX_REQUIRE_PROG). Finally, an automake flag is set and -# DO-IF-ON or DO-IF-OFF are called according to the final state of the feature. -AC_DEFUN([DX_ARG_ABLE], [ - AC_DEFUN([DX_CURRENT_FEATURE], [$1]) - AC_DEFUN([DX_CURRENT_DESCRIPTION], [$2]) - AC_ARG_ENABLE(doxygen-$1, - [AS_HELP_STRING(DX_IF_FEATURE([$1], [--disable-doxygen-$1], - [--enable-doxygen-$1]), - DX_IF_FEATURE([$1], [don't $2], [$2]))], - [ -case "$enableval" in -#( -y|Y|yes|Yes|YES) - AC_SUBST([DX_FLAG_$1], 1) - $3 -;; #( -n|N|no|No|NO) - AC_SUBST([DX_FLAG_$1], 0) -;; #( -*) - AC_MSG_ERROR([invalid value '$enableval' given to doxygen-$1]) -;; -esac -], [ -AC_SUBST([DX_FLAG_$1], [DX_IF_FEATURE([$1], 1, 0)]) -$4 -]) -if DX_TEST_FEATURE([$1]); then - $5 - : -fi -AM_CONDITIONAL(DX_COND_$1, DX_TEST_FEATURE([$1])) -if DX_TEST_FEATURE([$1]); then - $6 - : -else - $7 - : -fi -]) - -## -------------- ## -## Public macros. 
## -## -------------- ## - -# DX_XXX_FEATURE(DEFAULT_STATE) -# ----------------------------- -AC_DEFUN([DX_DOXYGEN_FEATURE], [AC_DEFUN([DX_FEATURE_doc], [$1])]) -AC_DEFUN([DX_DOT_FEATURE], [AC_DEFUN([DX_FEATURE_dot], [$1])]) -AC_DEFUN([DX_MAN_FEATURE], [AC_DEFUN([DX_FEATURE_man], [$1])]) -AC_DEFUN([DX_HTML_FEATURE], [AC_DEFUN([DX_FEATURE_html], [$1])]) -AC_DEFUN([DX_CHM_FEATURE], [AC_DEFUN([DX_FEATURE_chm], [$1])]) -AC_DEFUN([DX_CHI_FEATURE], [AC_DEFUN([DX_FEATURE_chi], [$1])]) -AC_DEFUN([DX_RTF_FEATURE], [AC_DEFUN([DX_FEATURE_rtf], [$1])]) -AC_DEFUN([DX_XML_FEATURE], [AC_DEFUN([DX_FEATURE_xml], [$1])]) -AC_DEFUN([DX_XML_FEATURE], [AC_DEFUN([DX_FEATURE_xml], [$1])]) -AC_DEFUN([DX_PDF_FEATURE], [AC_DEFUN([DX_FEATURE_pdf], [$1])]) -AC_DEFUN([DX_PS_FEATURE], [AC_DEFUN([DX_FEATURE_ps], [$1])]) - -# DX_INIT_DOXYGEN(PROJECT, [CONFIG-FILE], [OUTPUT-DOC-DIR]) -# --------------------------------------------------------- -# PROJECT also serves as the base name for the documentation files. -# The default CONFIG-FILE is "Doxyfile" and OUTPUT-DOC-DIR is "doxygen-doc". 
-AC_DEFUN([DX_INIT_DOXYGEN], [ - -# Files: -AC_SUBST([DX_PROJECT], [$1]) -AC_SUBST([DX_CONFIG], [ifelse([$2], [], Doxyfile, [$2])]) -AC_SUBST([DX_DOCDIR], [ifelse([$3], [], doxygen-doc, [$3])]) - -# Environment variables used inside doxygen.cfg: -DX_ENV_APPEND(SRCDIR, $srcdir) -DX_ENV_APPEND(PROJECT, $DX_PROJECT) -DX_ENV_APPEND(DOCDIR, $DX_DOCDIR) -DX_ENV_APPEND(VERSION, $PACKAGE_VERSION) - -# Doxygen itself: -DX_ARG_ABLE(doc, [generate any doxygen documentation], - [], - [], - [DX_REQUIRE_PROG([DX_DOXYGEN], doxygen) - DX_REQUIRE_PROG([DX_PERL], perl)], - [DX_ENV_APPEND(PERL_PATH, $DX_PERL)]) - -# Dot for graphics: -DX_ARG_ABLE(dot, [generate graphics for doxygen documentation], - [DX_CHECK_DEPEND(doc, 1)], - [DX_CLEAR_DEPEND(doc, 1)], - [DX_REQUIRE_PROG([DX_DOT], dot)], - [DX_ENV_APPEND(HAVE_DOT, YES) - DX_ENV_APPEND(DOT_PATH, [`DX_DIRNAME_EXPR($DX_DOT)`])], - [DX_ENV_APPEND(HAVE_DOT, NO)]) - -# Man pages generation: -DX_ARG_ABLE(man, [generate doxygen manual pages], - [DX_CHECK_DEPEND(doc, 1)], - [DX_CLEAR_DEPEND(doc, 1)], - [], - [DX_ENV_APPEND(GENERATE_MAN, YES)], - [DX_ENV_APPEND(GENERATE_MAN, NO)]) - -# RTF file generation: -DX_ARG_ABLE(rtf, [generate doxygen RTF documentation], - [DX_CHECK_DEPEND(doc, 1)], - [DX_CLEAR_DEPEND(doc, 1)], - [], - [DX_ENV_APPEND(GENERATE_RTF, YES)], - [DX_ENV_APPEND(GENERATE_RTF, NO)]) - -# XML file generation: -DX_ARG_ABLE(xml, [generate doxygen XML documentation], - [DX_CHECK_DEPEND(doc, 1)], - [DX_CLEAR_DEPEND(doc, 1)], - [], - [DX_ENV_APPEND(GENERATE_XML, YES)], - [DX_ENV_APPEND(GENERATE_XML, NO)]) - -# (Compressed) HTML help generation: -DX_ARG_ABLE(chm, [generate doxygen compressed HTML help documentation], - [DX_CHECK_DEPEND(doc, 1)], - [DX_CLEAR_DEPEND(doc, 1)], - [DX_REQUIRE_PROG([DX_HHC], hhc)], - [DX_ENV_APPEND(HHC_PATH, $DX_HHC) - DX_ENV_APPEND(GENERATE_HTML, YES) - DX_ENV_APPEND(GENERATE_HTMLHELP, YES)], - [DX_ENV_APPEND(GENERATE_HTMLHELP, NO)]) - -# Seperate CHI file generation. 
-DX_ARG_ABLE(chi, [generate doxygen seperate compressed HTML help index file], - [DX_CHECK_DEPEND(chm, 1)], - [DX_CLEAR_DEPEND(chm, 1)], - [], - [DX_ENV_APPEND(GENERATE_CHI, YES)], - [DX_ENV_APPEND(GENERATE_CHI, NO)]) - -# Plain HTML pages generation: -DX_ARG_ABLE(html, [generate doxygen plain HTML documentation], - [DX_CHECK_DEPEND(doc, 1) DX_CHECK_DEPEND(chm, 0)], - [DX_CLEAR_DEPEND(doc, 1) DX_CLEAR_DEPEND(chm, 0)], - [], - [DX_ENV_APPEND(GENERATE_HTML, YES)], - [DX_TEST_FEATURE(chm) || DX_ENV_APPEND(GENERATE_HTML, NO)]) - -# PostScript file generation: -DX_ARG_ABLE(ps, [generate doxygen PostScript documentation], - [DX_CHECK_DEPEND(doc, 1)], - [DX_CLEAR_DEPEND(doc, 1)], - [DX_REQUIRE_PROG([DX_LATEX], latex) - DX_REQUIRE_PROG([DX_MAKEINDEX], makeindex) - DX_REQUIRE_PROG([DX_DVIPS], dvips) - DX_REQUIRE_PROG([DX_EGREP], egrep)]) - -# PDF file generation: -DX_ARG_ABLE(pdf, [generate doxygen PDF documentation], - [DX_CHECK_DEPEND(doc, 1)], - [DX_CLEAR_DEPEND(doc, 1)], - [DX_REQUIRE_PROG([DX_PDFLATEX], pdflatex) - DX_REQUIRE_PROG([DX_MAKEINDEX], makeindex) - DX_REQUIRE_PROG([DX_EGREP], egrep)]) - -# LaTeX generation for PS and/or PDF: -AM_CONDITIONAL(DX_COND_latex, DX_TEST_FEATURE(ps) || DX_TEST_FEATURE(pdf)) -if DX_TEST_FEATURE(ps) || DX_TEST_FEATURE(pdf); then - DX_ENV_APPEND(GENERATE_LATEX, YES) -else - DX_ENV_APPEND(GENERATE_LATEX, NO) -fi - -# Paper size for PS and/or PDF: -AC_ARG_VAR(DOXYGEN_PAPER_SIZE, - [a4wide (default), a4, letter, legal or executive]) -case "$DOXYGEN_PAPER_SIZE" in -#( -"") - AC_SUBST(DOXYGEN_PAPER_SIZE, "") -;; #( -a4wide|a4|letter|legal|executive) - DX_ENV_APPEND(PAPER_SIZE, $DOXYGEN_PAPER_SIZE) -;; #( -*) - AC_MSG_ERROR([unknown DOXYGEN_PAPER_SIZE='$DOXYGEN_PAPER_SIZE']) -;; -esac - -#For debugging: -#echo DX_FLAG_doc=$DX_FLAG_doc -#echo DX_FLAG_dot=$DX_FLAG_dot -#echo DX_FLAG_man=$DX_FLAG_man -#echo DX_FLAG_html=$DX_FLAG_html -#echo DX_FLAG_chm=$DX_FLAG_chm -#echo DX_FLAG_chi=$DX_FLAG_chi -#echo DX_FLAG_rtf=$DX_FLAG_rtf -#echo 
DX_FLAG_xml=$DX_FLAG_xml -#echo DX_FLAG_pdf=$DX_FLAG_pdf -#echo DX_FLAG_ps=$DX_FLAG_ps -#echo DX_ENV=$DX_ENV -]) diff --git a/m4/ax_prog_fc_mpi.m4 b/m4/ax_prog_fc_mpi.m4 deleted file mode 100644 index 2e1b33cb3..000000000 --- a/m4/ax_prog_fc_mpi.m4 +++ /dev/null @@ -1,157 +0,0 @@ -# =========================================================================== -# http://www.gnu.org/software/autoconf-archive/ax_prog_fc_mpi.html -# =========================================================================== -# -# SYNOPSIS -# -# AX_PROG_FC_MPI([MPI-WANTED-TEST[, ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]]) -# -# DESCRIPTION -# -# This macro tries to find out how to compile Fortran77 programs that use -# MPI (Message Passing Interface), a standard API for parallel process -# communication (see http://www-unix.mcs.anl.gov/mpi/). The macro has to -# be used instead of the standard macro AC_PROG_FC and will replace the -# standard variable FC with the found compiler. -# -# MPI-WANTED-TEST is used to test whether MPI is actually wanted by the -# user. If MPI-WANTED_TEST is omitted or if it succeeds, the macro will -# try to find out how to use MPI, if it fails, the macro will call -# AC_PROG_CC to find a standard C compiler instead. -# -# When MPI is found, ACTION-IF-FOUND will be executed, if MPI is not found -# (or MPI-WANTED-TEST fails) ACTION-IF-NOT-FOUND is executed. If -# ACTION-IF-FOUND is not set, the macro will define HAVE_MPI. -# -# The following example demonstrates usage of the macro: -# -# # If --with-mpi=auto is used, try to find MPI, but use standard FC compiler if it is not found. -# # If --with-mpi=yes is used, try to find MPI and fail if it isn't found. -# # If --with-mpi=no is used, use a standard FC compiler instead. -# AC_ARG_WITH(mpi, [AS_HELP_STRING([--with-mpi], -# [compile with MPI (parallelization) support. If none is found, -# MPI is not used. 
Default: auto]) -# ],,[with_mpi=auto]) -# -# AX_PROG_FC_MPI([test x"$with_mpi" != xno],[use_mpi=yes],[ -# use_mpi=no -# if test x"$with_mpi" = xyes; then -# AC_MSG_FAILURE([MPI compiler requested, but couldn't use MPI.]) -# else -# AC_MSG_WARN([No MPI compiler found, won't use MPI.]) -# fi -# ]) -# -# LICENSE -# -# Copyright (c) 2010,2011 Olaf Lenz <olenz@icp.uni-stuttgart.de> -# -# This program is free software: you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation, either version 3 of the License, or (at your -# option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General -# Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program. If not, see <http://www.gnu.org/licenses/>. -# -# As a special exception, the respective Autoconf Macro's copyright owner -# gives unlimited permission to copy, distribute and modify the configure -# scripts that are the output of Autoconf when processing the Macro. You -# need not follow the terms of the GNU General Public License when using -# or distributing such scripts, even though portions of the text of the -# Macro appear in them. The GNU General Public License (GPL) does govern -# all other use of the material that constitutes the Autoconf Macro. -# -# This special exception to the GPL applies to versions of the Autoconf -# Macro released by the Autoconf Archive. When you make and distribute a -# modified version of the Autoconf Macro, you may extend this special -# exception to the GPL to apply to your modified version as well. - -#serial 2 - -AC_DEFUN([AX_PROG_FC_MPI], [ -AC_PREREQ(2.50) - -# Check for compiler -# Needs to be split off into an extra macro to ensure right expansion -# order. 
-AC_REQUIRE([_AX_PROG_FC_MPI],[_AX_PROG_FC_MPI([$1])]) - -AS_IF([test x"$_ax_prog_fc_mpi_mpi_wanted" = xno], - [ _ax_prog_fc_mpi_mpi_found=no ], - [ - AC_LANG_PUSH([Fortran]) - - # test whether MPI_INIT is available - # We do not use AC_SEARCH_LIBS here, as it caches its outcome and - # thus disallows corresponding calls in the other AX_PROG_*_MPI - # macros. - for lib in NONE mpichf90 fmpi fmpich; do - save_LIBS=$LIBS - if test x"$lib" = xNONE; then - AC_MSG_CHECKING([for function MPI_INIT]) - else - AC_MSG_CHECKING([for function MPI_INIT in -l$lib]) - LIBS="-l$lib $LIBS" - fi - AC_LINK_IFELSE([AC_LANG_CALL([],[MPI_INIT])], - [ _ax_prog_fc_mpi_mpi_found=yes ], - [ _ax_prog_fc_mpi_mpi_found=no ]) - AC_MSG_RESULT($_ax_prog_fc_mpi_mpi_found) - if test "x$_ax_prog_fc_mpi_mpi_found" = "xyes"; then - break; - fi - LIBS=$save_LIBS - done - - # Check for header - AS_IF([test x"$_ax_prog_fc_mpi_mpi_found" = xyes], [ - AC_MSG_CHECKING([for mpif.h]) - AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[[ - include 'mpif.h' -]])], - [ AC_MSG_RESULT(yes)], - [ AC_MSG_RESULT(no) - _ax_prog_fc_mpi_mpi_found=no - ]) - ]) - AC_LANG_POP([Fortran]) -]) - -# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: -AS_IF([test x"$_ax_prog_fc_mpi_mpi_found" = xyes], [ - ifelse([$2],,[AC_DEFINE(HAVE_MPI,1,[Define if you have the MPI library.])],[$2]) - : -],[ - $3 - : -]) - -])dnl AX_PROG_FC_MPI - -dnl _AX_PROG_FC_MPI is an internal macro required by AX_PROG_FC_MPI. -dnl To ensure the right expansion order, the main function AX_PROG_FC_MPI -dnl has to be split into two parts. This part looks for the MPI -dnl compiler, while the other one tests whether an MPI program can be -dnl compiled. 
-dnl -AC_DEFUN([_AX_PROG_FC_MPI], [ - ifelse([$1],,[_ax_prog_fc_mpi_mpi_wanted=yes],[ - AC_MSG_CHECKING([whether to compile using MPI]) - if $1; then - _ax_prog_fc_mpi_mpi_wanted=yes - else - _ax_prog_fc_mpi_mpi_wanted=no - fi - AC_MSG_RESULT($_ax_prog_fc_mpi_mpi_wanted) - ]) - if test x"$_ax_prog_fc_mpi_mpi_wanted" = xyes; then - AC_CHECK_TOOLS([FC], [mpiifort mpifort mpif95 mpxlf95_r mpxlf95 ftn mpif90 mpxlf90_r mpxlf90 mpf90 cmpif90c sxmpif90 mpif77 hf77 mpxlf_r mpxlf mpifrt mpf77 cmpifc xlf95 pgf95 pathf95 ifort g95 f95 fort ifc efc openf95 sunf95 crayftn gfortran lf95 ftn xlf90 f90 pgf90 pghpf pathf90 epcf90 sxf90 openf90 sunf90 xlf f77 frt pgf77 pathf77 g77 cf77 fort77 fl32 af77]) - fi - AC_PROG_FC -])dnl _AX_PROG_FC_MPI diff --git a/man/get_elpa_communicators.3 b/man/get_elpa_communicators.3 deleted file mode 100644 index d60429bef..000000000 --- a/man/get_elpa_communicators.3 +++ /dev/null @@ -1,59 +0,0 @@ -.TH "get_elpa_communicators" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -get_elpa_communicators \- get the MPI row and column communicators needed in ELPA -.br - -.SH SYNOPSIS -.br -.SS FORTRAN INTERFACE -use elpa1 - -.br -.RI "success = \fBget_elpa_communicators\fP (mpi_comm_global, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols)" - -.br -.br -.RI "integer, intent(in) \fBmpi_comm_global\fP: global communicator for the calculation" -.br -.RI "integer, intent(in) \fBmy_prow\fP: row coordinate of the calling process in the process grid" -.br -.RI "integer, intent(in) \fBmy_pcol\fP: column coordinate of the calling process in the process grid" -.br -.RI "integer, intent(out) \fBmpi_comm_row\fP: communicator for communication within rows of processes" -.br -.RI "integer, intent(out) \fBmpi_comm_row\fP: communicator for communication within columns of processes" -.br - -.RI "integer \fBsuccess\fP: return value indicating success or failure of the underlying MPI_COMM_SPLIT function" - -.SS C INTERFACE -#include "elpa_generated.h" - -.br 
-.RI "success = \fBget_elpa_communicators\fP (int mpi_comm_world, int my_prow, my_pcol, int *mpi_comm_rows, int *Pmpi_comm_cols);" - -.br -.br -.RI "int \fBmpi_comm_global\fP: global communicator for the calculation" -.br -.RI "int \fBmy_prow\fP: row coordinate of the calling process in the process grid" -.br -.RI "int \fBmy_pcol\fP: column coordinate of the calling process in the process grid" -.br -.RI "int *\fBmpi_comm_row\fP: pointer to the communicator for communication within rows of processes" -.br -.RI "int *\fBmpi_comm_row\fP: pointer to the communicator for communication within columns of processes" -.br - -.RI "int \fBsuccess\fP: return value indicating success or failure of the underlying MPI_COMM_SPLIT function" - - - - -.SH DESCRIPTION -All ELPA routines need MPI communicators for communicating within rows or columns of processes. These communicators are created from the \fBmpi_comm_global\fP communicator. It is assumed that the matrix used in ELPA is distributed with \fBmy_prow\fP rows and \fBmy_pcol\fP columns on the calling process. This function has to be envoked by all involved processes before any other calls to ELPA routines. -.br -.SH "SEE ALSO" -\fBsolve_evp_real\fP(3) \fBsolve_evp_complex\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) \fBprint_available_elpa2_kernels\fP(1) diff --git a/man/get_elpa_row_col_comms.3 b/man/get_elpa_row_col_comms.3 deleted file mode 100644 index ebae482e0..000000000 --- a/man/get_elpa_row_col_comms.3 +++ /dev/null @@ -1,61 +0,0 @@ -.TH "get_elpa_row_col_comms" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -get_elpa_row_col_comms \- old, deprecated interface to get the MPI row and column communicators needed in ELPA. 
-It is recommended to use \fBget_elpa_communicators\fP(3) -.br - -.SH SYNOPSIS -.br -.SS FORTRAN INTERFACE -use elpa1 - -.br -.RI "success = \fBget_elpa_row_col_comms\fP (mpi_comm_global, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols)" - -.br -.br -.RI "integer, intent(in) \fBmpi_comm_global\fP: global communicator for the calculation" -.br -.RI "integer, intent(in) \fBmy_prow\fP: row coordinate of the calling process in the process grid" -.br -.RI "integer, intent(in) \fBmy_pcol\fP: column coordinate of the calling process in the process grid" -.br -.RI "integer, intent(out) \fBmpi_comm_row\fP: communicator for communication within rows of processes" -.br -.RI "integer, intent(out) \fBmpi_comm_row\fP: communicator for communication within columns of processes" -.br - -.RI "integer \fBsuccess\fP: return value indicating success or failure of the underlying MPI_COMM_SPLIT function" - -.SS C INTERFACE -#include "elpa_generated.h" - -.br -.RI "success = \fBelpa_get_communicators\fP (int mpi_comm_world, int my_prow, my_pcol, int *mpi_comm_rows, int *Pmpi_comm_cols);" - -.br -.br -.RI "int \fBmpi_comm_global\fP: global communicator for the calculation" -.br -.RI "int \fBmy_prow\fP: row coordinate of the calling process in the process grid" -.br -.RI "int \fBmy_pcol\fP: column coordinate of the calling process in the process grid" -.br -.RI "int *\fBmpi_comm_row\fP: pointer to the communicator for communication within rows of processes" -.br -.RI "int *\fBmpi_comm_row\fP: pointer to the communicator for communication within columns of processes" -.br - -.RI "int \fBsuccess\fP: return value indicating success or failure of the underlying MPI_COMM_SPLIT function" - - - - - -.SH DESCRIPTION -All ELPA routines need MPI communicators for communicating within rows or columns of processes. These communicators are created from the \fBmpi_comm_global\fP communicator. 
It is assumed that the matrix used in ELPA is distributed with \fBmy_prow\fP rows and \fBmy_pcol\fP columns on the calling process. This function has to be envoked by all involved processes before any other calls to ELPA routines. -.br -.SH "SEE ALSO" -\fBget_elpa_communicators\fP(3) \fBsolve_evp_real\fP(3) \fBsolve_evp_complex\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) \fBprint_available_elpa2_kernels\fP(1) diff --git a/man/print_available_elpa2_kernels.1 b/man/print_available_elpa2_kernels.1 deleted file mode 100644 index 36fdc61d5..000000000 --- a/man/print_available_elpa2_kernels.1 +++ /dev/null @@ -1,27 +0,0 @@ -.TH "print_available_elpa2_kernels" 1 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -print_available_elpa2_kernels \- Provide information which ELPA2 kernels are available on this system\&. - -.SH SYNOPSIS -.br -print_available_elpa2_kernels -.br - -.SH "Description" -.PP -Provide information which ELPA2 kernels are available on this system. -.br -It is possible to configure ELPA2 such, that different compute intensive 'ELPA2 kernels' can be choosen at runtime. The service binary print_available_elpa2_kernels will query the library and tell whether ELPA2 has been configured in this way, and if this is the case which kernels can be choosen at runtime. It will furthermore detail whether ELPA has been configured with OpenMP support. -.SH "Options" -.PP -.br -none -.SH "Author" -A. 
Marek, MPCDF -.SH "Reporting bugs" -Report bugs to the ELPA mail elpa-library@mpcdf.mpg.de -.SH "SEE ALSO" -\fBget_elpa_communicators\fP(3) \fBsolve_evp_real\fP(3) \fBsolve_evp_complex\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) - diff --git a/man/solve_evp_complex.3 b/man/solve_evp_complex.3 deleted file mode 100644 index 3984ca0b1..000000000 --- a/man/solve_evp_complex.3 +++ /dev/null @@ -1,51 +0,0 @@ -.TH "solve_evp_complex" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -solve_evp_complex \- solve the complex eigenvalue problem with the 1-stage ELPA solver. -This interface is old and deprecated. It is recommended to use \fBsolve_evp_complex_1stage\fP(3) -.br - -.SH SYNOPSIS -.br -.SS FORTRAN INTERFACE -use elpa1 -.br -.br -.RI "success = \fBsolve_evp_complex\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "complex*16, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. 
The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "real*8, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues" -.br -.RI "complex*16, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors" -.br -.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" -.br - -.RI "logical \fBsuccess\fP: return value indicating success or failure" -.br -.SH DESCRIPTION -Solve the complex eigenvalue problem with the 1-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver. 
-.br -.SH "SEE ALSO" -\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) \fBprint_available_elpa2_kernels\fP(1) diff --git a/man/solve_evp_complex_1stage.3 b/man/solve_evp_complex_1stage.3 deleted file mode 100644 index a859d20fc..000000000 --- a/man/solve_evp_complex_1stage.3 +++ /dev/null @@ -1,88 +0,0 @@ -.TH "solve_evp_complex_1stage" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -solve_evp_complex_1stage \- solve the complex eigenvalue problem with the 1-stage ELPA solver -.br - -.SH SYNOPSIS -.br -.SS FORTRAN INTERFACE -use elpa1 -.br -.br -.RI "success = \fBsolve_evp_complex_1stage\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "complex*16, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "real*8, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues" -.br -.RI "complex*16, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors" -.br -.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. 
Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" -.br - -.RI "logical \fBsuccess\fP: return value indicating success or failure" -.br -.SS C INTERFACE -#include "elpa.h" -.br -#include <complex.h> - -.br -.RI "success = \fBsolve_evp_complex_1stage\fP (\fBint\fP na, \fBint\fP nev, \fB double complex *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble complex*\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols);" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "double complex *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "double *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues" -.br -.RI "double complex *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors" -.br -.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "int \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "int \fBmpi_comm_cols\fP: communicator for communication in colums. 
Constructed with \fBget_elpa_communicators\fP(3)" -.br - -.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0) - -.SH DESCRIPTION -Solve the complex eigenvalue problem with the 1-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver. -.br -.SH "SEE ALSO" -\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) \fBprint_available_elpa2_kernels\fP(1) diff --git a/man/solve_evp_complex_2stage.3 b/man/solve_evp_complex_2stage.3 deleted file mode 100644 index 71be1b5d6..000000000 --- a/man/solve_evp_complex_2stage.3 +++ /dev/null @@ -1,91 +0,0 @@ -.TH "solve_evp_complex_2stage" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -solve_evp_complex_2stage \- solve the complex eigenvalue problem with the 2-stage ELPA solver -.br - -.SH SYNOPSIS -.br -.SS FORTRAN INTERFACE -use elpa1 -use elpa2 -.br -.br -.RI "success = \fBsolve_evp_real_2stage\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL)" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "complex*16, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. 
The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "real*8, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues" -.br -.RI "complex*16, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors" -.br -.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "integer, intent(in) \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA" -.br -.RI "logical \fBsuccess\fP: return value indicating success or failure" -.br -.SS C INTERFACE -#include "elpa.h" -.br -#include <complex.h> - -.br -.RI "success = \fBsolve_evp_complex_2stage\fP (\fBint\fP na, \fBint\fP nev, \fB double complex *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble complex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL);" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "double complex *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. 
The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "double *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues" -.br -.RI "double complex *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors" -.br -.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "int \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "int \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "int \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA" -.br -.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0) - -.SH DESCRIPTION -Solve the complex eigenvalue problem with the 2-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver. 
-.br -.SH "SEE ALSO" -\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage\fP(3) \fBsolve_evp_complex_1stage\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBprint_available_elpa2_kernels\fP(1) diff --git a/man/solve_evp_real.3 b/man/solve_evp_real.3 deleted file mode 100644 index a6762cd20..000000000 --- a/man/solve_evp_real.3 +++ /dev/null @@ -1,51 +0,0 @@ -.TH "solve_evp_real" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -solve_evp_real \- solve the real eigenvalue problem with the 1-stage ELPA solver. -This is an old and deprecated interface. It is recommendet to use \fBsolve_evp_real_1stage\fP(3) -.br - -.SH SYNOPSIS -.br -.SS FORTRAN INTERFACE -use elpa1 -.br -.br -.RI "success = \fBsolve_evp_real\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "real*8, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. 
The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "real*8, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues" -.br -.RI "real*8, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors" -.br -.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" -.br - -.RI "logical \fBsuccess\fP: return value indicating success or failure" -.br -.SH DESCRIPTION -Solve the real eigenvalue problem with the 1-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver. 
-.br -.SH "SEE ALSO" -\fBget_elpa_communicators\fP(3) \fBsolve_evp_complex_1stage\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) \fBprint_available_elpa2_kernels\fP(1) diff --git a/man/solve_evp_real_1stage.3 b/man/solve_evp_real_1stage.3 deleted file mode 100644 index aa7431d18..000000000 --- a/man/solve_evp_real_1stage.3 +++ /dev/null @@ -1,86 +0,0 @@ -.TH "solve_evp_real_1stage" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -solve_evp_real_1stage \- solve the real eigenvalue problem with the 1-stage ELPA solver -.br - -.SH SYNOPSIS -.br -.SS FORTRAN INTERFACE -use elpa1 -.br -.br -.RI "success = \fBsolve_evp_real_1stage\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "real*8, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "real*8, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues" -.br -.RI "real*8, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors" -.br -.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. 
Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" -.br - -.RI "logical \fBsuccess\fP: return value indicating success or failure" -.br -.SS C INTERFACE -#include "elpa.h" - -.br -.RI "success = \fBsolve_evp_real_1stage\fP (\fBint\fP na, \fBint\fP nev, \fB double *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols);" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "double *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "double *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues" -.br -.RI "double *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors" -.br -.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "int \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "int \fBmpi_comm_cols\fP: communicator for communication in colums. 
Constructed with \fBget_elpa_communicators\fP(3)" -.br - -.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0) - -.SH DESCRIPTION -Solve the real eigenvalue problem with the 1-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver. -.br -.SH "SEE ALSO" -\fBget_elpa_communicators\fP(3) \fBsolve_evp_complex_1stage\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) \fBprint_available_elpa2_kernels\fP(1) diff --git a/man/solve_evp_real_2stage.3 b/man/solve_evp_real_2stage.3 deleted file mode 100644 index 207216f35..000000000 --- a/man/solve_evp_real_2stage.3 +++ /dev/null @@ -1,93 +0,0 @@ -.TH "solve_evp_real_2stage" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -solve_evp_real_2stage \- solve the real eigenvalue problem with the 2-stage ELPA solver -.br - -.SH SYNOPSIS -.br -.SS FORTRAN INTERFACE -use elpa1 -use elpa2 -.br -.br -.RI "success = \fBsolve_evp_real_2stage\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQr=useQR)" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "real*8, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. 
The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "real*8, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues" -.br -.RI "real*8, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors" -.br -.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "integer, intent(in) \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA" -.br -.RI "logical, intent(in), optional: \fBuseQR\fP: optional argument; switches to QR-decomposition if set to .true." 
- -.RI "logical \fBsuccess\fP: return value indicating success or failure" -.br -.SS C INTERFACE -#include "elpa.h" - -.br -.RI "success = \fBsolve_evp_real_2stage\fP (\fBint\fP na, \fBint\fP nev, \fB double *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQr);" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "double *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "double *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues" -.br -.RI "double *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors" -.br -.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "int \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "int \fBmpi_comm_cols\fP: communicator for communication in colums. 
Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "int \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA" -.br -.RI "int \fBuseQR\fP: if set to 1 switch to QR-decomposition" - -.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0) - -.SH DESCRIPTION -Solve the real eigenvalue problem with the 2-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver. -.br -.SH "SEE ALSO" -\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage\fP(3) \fBsolve_evp_complex_1stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) \fBprint_available_elpa2_kernels\fP(1) diff --git a/src/elpa1.F90 b/src/elpa1.F90 deleted file mode 100644 index e0292c22f..000000000 --- a/src/elpa1.F90 +++ /dev/null @@ -1,476 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! This particular source code file contains additions, changes and -! enhancements authored by Intel Corporation which is not part of -! 
the ELPA consortium. -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". 
- -!> \mainpage -!> Eigenvalue SoLvers for Petaflop-Applications (ELPA) -!> \par -!> http://elpa.mpcdf.mpg.de -!> -!> \par -!> The ELPA library was originally created by the ELPA consortium, -!> consisting of the following organizations: -!> -!> - Max Planck Computing and Data Facility (MPCDF) formerly known as -!> Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -!> - Bergische Universität Wuppertal, Lehrstuhl für angewandte -!> Informatik, -!> - Technische Universität München, Lehrstuhl für Informatik mit -!> Schwerpunkt Wissenschaftliches Rechnen , -!> - Fritz-Haber-Institut, Berlin, Abt. Theorie, -!> - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -!> Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -!> and -!> - IBM Deutschland GmbH -!> -!> Some parts and enhancements of ELPA have been contributed and authored -!> by the Intel Corporation which is not part of the ELPA consortium. -!> -!> Contributions to the ELPA source have been authored by (in alphabetical order): -!> -!> \author T. Auckenthaler, Volker Blum, A. Heinecke, L. Huedepohl, R. Johanni, Werner Jürgens, and A. Marek - - -#include "config-f90.h" -!> \brief Fortran module which provides the routines to use the one-stage ELPA solver -module ELPA1 - use precision - use elpa_utilities - use elpa1_compute - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - PRIVATE ! By default, all routines contained are private - - ! 
The following routines are public: - - public :: get_elpa_row_col_comms !< old, deprecated interface: Sets MPI row/col communicators - public :: get_elpa_communicators !< Sets MPI row/col communicators - - public :: solve_evp_real !< old, deprecated interface: Driver routine for real eigenvalue problem - public :: solve_evp_real_1stage !< Driver routine for real eigenvalue problem - public :: solve_evp_complex !< old, deprecated interface: Driver routine for complex eigenvalue problem - public :: solve_evp_complex_1stage !< Driver routine for complex eigenvalue problem - - ! Timing results, set by every call to solve_evp_xxx - - real(kind=rk), public :: time_evp_fwd !< time for forward transformations (to tridiagonal form) - real(kind=rk), public :: time_evp_solve !< time for solving the tridiagonal system - real(kind=rk), public :: time_evp_back !< time for back transformations of eigenvectors - - logical, public :: elpa_print_times = .false. !< Set elpa_print_times to .true. for explicit timing outputs - - include 'mpif.h' - -!> \brief get_elpa_row_col_comms: old, deprecated Fortran function to create the MPI communicators for ELPA. Better use "elpa_get_communicators" -!> \detail -!> The interface and variable definition is the same as in "elpa_get_communicators" -!> \param mpi_comm_global Global communicator for the calculations (in) -!> -!> \param my_prow Row coordinate of the calling process in the process grid (in) -!> -!> \param my_pcol Column coordinate of the calling process in the process grid (in) -!> -!> \param mpi_comm_rows Communicator for communicating within rows of processes (out) -!> -!> \param mpi_comm_cols Communicator for communicating within columns of processes (out) -!> \result mpierr integer error value of mpi_comm_split function - interface get_elpa_row_col_comms - module procedure get_elpa_communicators - end interface - -!> \brief solve_evp_real: old, deprecated Fortran function to solve the real eigenvalue problem with 1-stage solver. 
Better use "solve_evp_real_1stage" -!> -!> \detail -!> The interface and variable definition is the same as in "elpa_solve_evp_real_1stage" -! Parameters -! -!> \param na Order of matrix a -!> -!> \param nev Number of eigenvalues needed. -!> The smallest nev eigenvalues/eigenvectors are calculated. -!> -!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. -!> Distribution is like in Scalapack. -!> The full matrix must be set (not only one half like in scalapack). -!> Destroyed on exit (upper and lower half). -!> -!> \param lda Leading dimension of a -!> -!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set -!> -!> \param q(ldq,matrixCols) On output: Eigenvectors of a -!> Distribution is like in Scalapack. -!> Must be always dimensioned to the full size (corresponding to (na,na)) -!> even if only a part of the eigenvalues is needed. -!> -!> \param ldq Leading dimension of q -!> -!> \param nblk blocksize of cyclic distribution, must be the same in both directions! -!> -!> \param matrixCols distributed number of matrix columns -!> -!> \param mpi_comm_rows MPI-Communicator for rows -!> \param mpi_comm_cols MPI-Communicator for columns -!> -!> \result success - - - interface solve_evp_real - module procedure solve_evp_real_1stage - end interface - -!> \brief solve_evp_complex: old, deprecated Fortran function to solve the complex eigenvalue problem with 1-stage solver. Better use "solve_evp_complex_1stage" -!> -!> \detail -!> The interface and variable definition is the same as in "elpa_solve_evp_complex_1stage" -! Parameters -! -!> \param na Order of matrix a -!> -!> \param nev Number of eigenvalues needed. -!> The smallest nev eigenvalues/eigenvectors are calculated. -!> -!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. -!> Distribution is like in Scalapack. -!> The full matrix must be set (not only one half like in scalapack). 
-!> Destroyed on exit (upper and lower half). -!> -!> \param lda Leading dimension of a -!> -!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set -!> -!> \param q(ldq,matrixCols) On output: Eigenvectors of a -!> Distribution is like in Scalapack. -!> Must be always dimensioned to the full size (corresponding to (na,na)) -!> even if only a part of the eigenvalues is needed. -!> -!> \param ldq Leading dimension of q -!> -!> \param nblk blocksize of cyclic distribution, must be the same in both directions! -!> -!> \param matrixCols distributed number of matrix columns -!> -!> \param mpi_comm_rows MPI-Communicator for rows -!> \param mpi_comm_cols MPI-Communicator for columns -!> -!> \result success - - - interface solve_evp_complex - module procedure solve_evp_complex_1stage - end interface - -contains - -!------------------------------------------------------------------------------- - -!> \brief Fortran function to create the MPI communicators for ELPA. -! All ELPA routines need MPI communicators for communicating within -! rows or columns of processes, these are set here. -! mpi_comm_rows/mpi_comm_cols can be free'd with MPI_Comm_free if not used any more. -! -! Parameters -! 
-!> \param mpi_comm_global Global communicator for the calculations (in) -!> -!> \param my_prow Row coordinate of the calling process in the process grid (in) -!> -!> \param my_pcol Column coordinate of the calling process in the process grid (in) -!> -!> \param mpi_comm_rows Communicator for communicating within rows of processes (out) -!> -!> \param mpi_comm_cols Communicator for communicating within columns of processes (out) -!> \result mpierr integer error value of mpi_comm_split function - - -function get_elpa_communicators(mpi_comm_global, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols) result(mpierr) - use precision - implicit none - - integer(kind=ik), intent(in) :: mpi_comm_global, my_prow, my_pcol - integer(kind=ik), intent(out) :: mpi_comm_rows, mpi_comm_cols - - integer(kind=ik) :: mpierr - - ! mpi_comm_rows is used for communicating WITHIN rows, i.e. all processes - ! having the same column coordinate share one mpi_comm_rows. - ! So the "color" for splitting is my_pcol and the "key" is my row coordinate. - ! Analogous for mpi_comm_cols - - call mpi_comm_split(mpi_comm_global,my_pcol,my_prow,mpi_comm_rows,mpierr) - call mpi_comm_split(mpi_comm_global,my_prow,my_pcol,mpi_comm_cols,mpierr) - -end function get_elpa_communicators - - -!> \brief solve_evp_real_1stage: Fortran function to solve the real eigenvalue problem with 1-stage solver -!> -! Parameters -! -!> \param na Order of matrix a -!> -!> \param nev Number of eigenvalues needed. -!> The smallest nev eigenvalues/eigenvectors are calculated. -!> -!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. -!> Distribution is like in Scalapack. -!> The full matrix must be set (not only one half like in scalapack). -!> Destroyed on exit (upper and lower half). 
-!> -!> \param lda Leading dimension of a -!> -!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set -!> -!> \param q(ldq,matrixCols) On output: Eigenvectors of a -!> Distribution is like in Scalapack. -!> Must be always dimensioned to the full size (corresponding to (na,na)) -!> even if only a part of the eigenvalues is needed. -!> -!> \param ldq Leading dimension of q -!> -!> \param nblk blocksize of cyclic distribution, must be the same in both directions! -!> -!> \param matrixCols distributed number of matrix columns -!> -!> \param mpi_comm_rows MPI-Communicator for rows -!> \param mpi_comm_cols MPI-Communicator for columns -!> -!> \result success - - -function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) result(success) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=rk) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols) - ! was - ! real a(lda,*), q(ldq,*) - - integer(kind=ik) :: my_prow, my_pcol, mpierr - real(kind=rk), allocatable :: e(:), tau(:) - real(kind=rk) :: ttt0, ttt1 - logical :: success - logical, save :: firstCall = .true. - logical :: wantDebug - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_evp_real_1stage") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - - success = .true. - - wantDebug = .false. - if (firstCall) then - ! are debug messages desired? - wantDebug = debug_messages_via_environment_variable() - firstCall = .false. - endif - - allocate(e(na), tau(na)) - - ttt0 = MPI_Wtime() - call tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau) - ttt1 = MPI_Wtime() - if(my_prow==0 .and. my_pcol==0 .and. 
elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0 - time_evp_fwd = ttt1-ttt0 - - ttt0 = MPI_Wtime() - call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, & - mpi_comm_cols, wantDebug, success) - if (.not.(success)) return - - ttt1 = MPI_Wtime() - if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0 - time_evp_solve = ttt1-ttt0 - - ttt0 = MPI_Wtime() - call trans_ev_real(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) - ttt1 = MPI_Wtime() - if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_real:',ttt1-ttt0 - time_evp_back = ttt1-ttt0 - - deallocate(e, tau) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_evp_real_1stage") -#endif - -end function solve_evp_real_1stage - - -!> \brief solve_evp_complex_1stage: Fortran function to solve the complex eigenvalue problem with 1-stage solver -!> -! Parameters -! -!> \param na Order of matrix a -!> -!> \param nev Number of eigenvalues needed. -!> The smallest nev eigenvalues/eigenvectors are calculated. -!> -!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. -!> Distribution is like in Scalapack. -!> The full matrix must be set (not only one half like in scalapack). -!> Destroyed on exit (upper and lower half). -!> -!> \param lda Leading dimension of a -!> -!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set -!> -!> \param q(ldq,matrixCols) On output: Eigenvectors of a -!> Distribution is like in Scalapack. -!> Must be always dimensioned to the full size (corresponding to (na,na)) -!> even if only a part of the eigenvalues is needed. -!> -!> \param ldq Leading dimension of q -!> -!> \param nblk blocksize of cyclic distribution, must be the same in both directions! 
-!> -!> \param matrixCols distributed number of matrix columns -!> -!> \param mpi_comm_rows MPI-Communicator for rows -!> \param mpi_comm_cols MPI-Communicator for columns -!> -!> \result success - -function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) result(success) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols) - ! was - ! complex a(lda,*), q(ldq,*) - real(kind=rk) :: ev(na) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_rows, l_cols, l_cols_nev - real(kind=rk), allocatable :: q_real(:,:), e(:) - complex(kind=ck), allocatable :: tau(:) - real(kind=rk) :: ttt0, ttt1 - - logical :: success - logical, save :: firstCall = .true. - logical :: wantDebug - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_evp_complex_1stage") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - success = .true. - - wantDebug = .false. - if (firstCall) then - ! are debug messages desired? - wantDebug = debug_messages_via_environment_variable() - firstCall = .false. - endif - - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q - - l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev - - allocate(e(na), tau(na)) - allocate(q_real(l_rows,l_cols)) - - ttt0 = MPI_Wtime() - call tridiag_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau) - ttt1 = MPI_Wtime() - if(my_prow==0 .and. my_pcol==0 .and. 
elpa_print_times) write(error_unit,*) 'Time tridiag_complex :',ttt1-ttt0 - time_evp_fwd = ttt1-ttt0 - - ttt0 = MPI_Wtime() - call solve_tridi(na, nev, ev, e, q_real, l_rows, nblk, matrixCols, mpi_comm_rows, & - mpi_comm_cols, wantDebug, success) - if (.not.(success)) return - - ttt1 = MPI_Wtime() - if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0 - time_evp_solve = ttt1-ttt0 - - ttt0 = MPI_Wtime() - q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev) - - call trans_ev_complex(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) - ttt1 = MPI_Wtime() - if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_complex:',ttt1-ttt0 - time_evp_back = ttt1-ttt0 - - deallocate(q_real) - deallocate(e, tau) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_evp_complex_1stage") -#endif - -end function solve_evp_complex_1stage - - - -end module ELPA1 diff --git a/src/elpa1_compute.F90 b/src/elpa1_compute.F90 deleted file mode 100644 index 84c35f7f5..000000000 --- a/src/elpa1_compute.F90 +++ /dev/null @@ -1,4025 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! This particular source code file contains additions, changes and -! enhancements authored by Intel Corporation which is not part of -! the ELPA consortium. -! -! 
More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - -#include "config-f90.h" - -module ELPA1_compute - use elpa_utilities -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - PRIVATE ! set default to private - - public :: tridiag_real ! Transform real symmetric matrix to tridiagonal form - public :: trans_ev_real ! Transform eigenvectors of a tridiagonal matrix back - public :: mult_at_b_real ! Multiply real matrices A**T * B - - public :: tridiag_complex ! Transform complex hermitian matrix to tridiagonal form - public :: trans_ev_complex ! 
Transform eigenvectors of a tridiagonal matrix back - public :: mult_ah_b_complex ! Multiply complex matrices A**H * B - - public :: solve_tridi ! Solve tridiagonal eigensystem with divide and conquer method - - public :: cholesky_real ! Cholesky factorization of a real matrix - public :: invert_trm_real ! Invert real triangular matrix - - public :: cholesky_complex ! Cholesky factorization of a complex matrix - public :: invert_trm_complex ! Invert complex triangular matrix - - public :: local_index ! Get local index of a block cyclic distributed matrix - public :: least_common_multiple ! Get least common multiple - - public :: hh_transform_real - public :: hh_transform_complex - - public :: elpa_reduce_add_vectors_complex, elpa_reduce_add_vectors_real - public :: elpa_transpose_vectors_complex, elpa_transpose_vectors_real - - include 'mpif.h' - - contains - -#define DATATYPE REAL(kind=rk) -#define BYTESIZE 8 -#define REALCASE 1 -#include "elpa_transpose_vectors.X90" -#include "elpa_reduce_add_vectors.X90" -#undef DATATYPE -#undef BYTESIZE -#undef REALCASE - - subroutine tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) - - !------------------------------------------------------------------------------- - ! tridiag_real: Reduces a distributed symmetric matrix to tridiagonal form - ! (like Scalapack Routine PDSYTRD) - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be reduced. - ! Distribution is like in Scalapack. - ! Opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half) - ! a(:,:) is overwritten on exit with the Householder vectors - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! d(na) Diagonal elements (returned), identical on all processors - ! - ! 
e(na) Off-Diagonal elements (returned), identical on all processors - ! - ! tau(na) Factors for the Householder vectors (returned), needed for back transformation - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=rk) :: d(na), e(na), tau(na) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: a(lda,*) -#else - real(kind=rk) :: a(lda,matrixCols) -#endif - - integer(kind=ik), parameter :: max_stored_rows = 32 - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols - integer(kind=ik) :: l_cols, l_rows, nstor - integer(kind=ik) :: istep, i, j, lcs, lce, lrs, lre - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - -#ifdef WITH_OPENMP - integer(kind=ik) :: my_thread, n_threads, max_threads, n_iter - integer(kind=ik) :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads -#endif - - real(kind=rk) :: vav, vnorm2, x, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf - - real(kind=rk), allocatable :: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:) -#ifdef WITH_OPENMP - real(kind=rk), allocatable :: ur_p(:,:), uc_p(:,:) -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("tridiag_real") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! 
local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile - - - totalblocks = (na-1)/nblk + 1 - max_blocks_row = (totalblocks-1)/np_rows + 1 - max_blocks_col = (totalblocks-1)/np_cols + 1 - - max_local_rows = max_blocks_row*nblk - max_local_cols = max_blocks_col*nblk - - allocate(tmp(MAX(max_local_rows,max_local_cols))) - allocate(vr(max_local_rows+1)) - allocate(ur(max_local_rows)) - allocate(vc(max_local_cols)) - allocate(uc(max_local_cols)) - -#ifdef WITH_OPENMP - max_threads = omp_get_max_threads() - - allocate(ur_p(max_local_rows,0:max_threads-1)) - allocate(uc_p(max_local_cols,0:max_threads-1)) -#endif - - tmp = 0 - vr = 0 - ur = 0 - vc = 0 - uc = 0 - - allocate(vur(max_local_rows,2*max_stored_rows)) - allocate(uvc(max_local_cols,2*max_stored_rows)) - - d(:) = 0 - e(:) = 0 - tau(:) = 0 - - nstor = 0 - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a - if(my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) d(na) = a(l_rows,l_cols) - - do istep=na,3,-1 - - ! Calculate number of local rows and columns of the still remaining matrix - ! on the local processor - - l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1) - l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1) - - ! Calculate vector for Householder transformation on all procs - ! owning column istep - - if(my_pcol==pcol(istep, nblk, np_cols)) then - - ! Get vector to be transformed; distribute last element and norm of - ! remaining elements to all procs in current column - - vr(1:l_rows) = a(1:l_rows,l_cols+1) - if(nstor>0 .and. l_rows>0) then - call DGEMV('N',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1), & - uvc(l_cols+1,1),ubound(uvc,dim=1),1.d0,vr,1) - endif - - if(my_prow==prow(istep-1, nblk, np_rows)) then - aux1(1) = dot_product(vr(1:l_rows-1),vr(1:l_rows-1)) - aux1(2) = vr(l_rows) - else - aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows)) - aux1(2) = 0. 
- endif - - call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - - vnorm2 = aux2(1) - vrl = aux2(2) - - ! Householder transformation - - call hh_transform_real(vrl, vnorm2, xf, tau(istep)) - - ! Scale vr and store Householder vector for back transformation - - vr(1:l_rows) = vr(1:l_rows) * xf - if(my_prow==prow(istep-1, nblk, np_rows)) then - vr(l_rows) = 1. - e(istep-1) = vrl - endif - a(1:l_rows,l_cols+1) = vr(1:l_rows) ! store Householder vector for back transformation - - endif - - ! Broadcast the Householder vector (and tau) along columns - - if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep) - call MPI_Bcast(vr,l_rows+1,MPI_REAL8,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr) - tau(istep) = vr(l_rows+1) - - ! Transpose Householder vector vr -> vc - - call elpa_transpose_vectors_real (vr, ubound(vr,dim=1), mpi_comm_rows, & - vc, ubound(vc,dim=1), mpi_comm_cols, & - 1, istep-1, 1, nblk) - - - ! Calculate u = (A + VU**T + UV**T)*v - - ! For cache efficiency, we use only the upper half of the matrix tiles for this, - ! thus the result is partly in uc(:) and partly in ur(:) - - uc(1:l_cols) = 0 - ur(1:l_rows) = 0 - if (l_rows>0 .and. l_cols>0) then - -#ifdef WITH_OPENMP - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre) - - my_thread = omp_get_thread_num() - n_threads = omp_get_num_threads() - - n_iter = 0 - - uc_p(1:l_cols,my_thread) = 0. - ur_p(1:l_rows,my_thread) = 0. 
-#endif - do i=0,(istep-2)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - if (lce<lcs) cycle - do j=0,i - lrs = j*l_rows_tile+1 - lre = min(l_rows,(j+1)*l_rows_tile) - if (lre<lrs) cycle -#ifdef WITH_OPENMP - if (mod(n_iter,n_threads) == my_thread) then - call DGEMV('T',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vr(lrs),1,1.d0,uc_p(lcs,my_thread),1) - if (i/=j) call DGEMV('N',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vc(lcs),1,1.d0,ur_p(lrs,my_thread),1) - endif - n_iter = n_iter+1 -#else - call DGEMV('T',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vr(lrs),1,1.d0,uc(lcs),1) - if (i/=j) call DGEMV('N',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vc(lcs),1,1.d0,ur(lrs),1) - -#endif - enddo - enddo -#ifdef WITH_OPENMP -!$OMP END PARALLEL -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - - do i=0,max_threads-1 - uc(1:l_cols) = uc(1:l_cols) + uc_p(1:l_cols,i) - ur(1:l_rows) = ur(1:l_rows) + ur_p(1:l_rows,i) - enddo -#endif - if (nstor>0) then - call DGEMV('T',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1),vr,1,0.d0,aux,1) - call DGEMV('N',l_cols,2*nstor,1.d0,uvc,ubound(uvc,dim=1),aux,1,1.d0,uc,1) - endif - - endif - - ! Sum up all ur(:) parts along rows and add them to the uc(:) parts - ! on the processors containing the diagonal - ! This is only necessary if ur has been calculated, i.e. if the - ! global tile size is smaller than the global remaining matrix - - if (tile_size < istep-1) then - call elpa_reduce_add_vectors_REAL (ur, ubound(ur,dim=1), mpi_comm_rows, & - uc, ubound(uc,dim=1), mpi_comm_cols, & - istep-1, 1, nblk) - endif - - ! Sum up all the uc(:) parts, transpose uc -> ur - - if (l_cols>0) then - tmp(1:l_cols) = uc(1:l_cols) - call mpi_allreduce(tmp,uc,l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - endif - - call elpa_transpose_vectors_real (uc, ubound(uc,dim=1), mpi_comm_cols, & - ur, ubound(ur,dim=1), mpi_comm_rows, & - 1, istep-1, 1, nblk) - - ! 
calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v ) - - x = 0 - if (l_cols>0) x = dot_product(vc(1:l_cols),uc(1:l_cols)) - call mpi_allreduce(x,vav,1,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) - - ! store u and v in the matrices U and V - ! these matrices are stored combined in one here - - do j=1,l_rows - vur(j,2*nstor+1) = tau(istep)*vr(j) - vur(j,2*nstor+2) = 0.5*tau(istep)*vav*vr(j) - ur(j) - enddo - do j=1,l_cols - uvc(j,2*nstor+1) = 0.5*tau(istep)*vav*vc(j) - uc(j) - uvc(j,2*nstor+2) = tau(istep)*vc(j) - enddo - - nstor = nstor+1 - - ! If the limit of max_stored_rows is reached, calculate A + VU**T + UV**T - - if (nstor==max_stored_rows .or. istep==3) then - - do i=0,(istep-2)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - lrs = 1 - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce<lcs .or. lre<lrs) cycle - call dgemm('N','T',lre-lrs+1,lce-lcs+1,2*nstor,1.d0, & - vur(lrs,1),ubound(vur,dim=1),uvc(lcs,1),ubound(uvc,dim=1), & - 1.d0,a(lrs,lcs),lda) - enddo - - nstor = 0 - - endif - - if (my_prow==prow(istep-1, nblk, np_rows) .and. my_pcol==pcol(istep-1, nblk, np_cols)) then - if (nstor>0) a(l_rows,l_cols) = a(l_rows,l_cols) & - + dot_product(vur(l_rows,1:2*nstor),uvc(l_cols,1:2*nstor)) - d(istep-1) = a(l_rows,l_cols) - endif - - enddo - - ! Store e(1) and d(1) - - if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) e(1) = a(1,l_cols) ! use last l_cols value of loop above - if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1) - - deallocate(tmp, vr, ur, vc, uc, vur, uvc) - - ! 
distribute the arrays d and e to all processors - - allocate(tmp(na)) - tmp = d - call mpi_allreduce(tmp,d,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - tmp = d - call mpi_allreduce(tmp,d,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) - tmp = e - call mpi_allreduce(tmp,e,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - tmp = e - call mpi_allreduce(tmp,e,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) - deallocate(tmp) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("tridiag_real") -#endif - - end subroutine tridiag_real - - subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) - - !------------------------------------------------------------------------------- - ! trans_ev_real: Transforms the eigenvectors of a tridiagonal matrix back - ! to the eigenvectors of the original matrix - ! (like Scalapack Routine PDORMTR) - ! - ! Parameters - ! - ! na Order of matrix a, number of rows of matrix q - ! - ! nqc Number of columns of matrix q - ! - ! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real) - ! Distribution is like in Scalapack. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a and q - ! - ! tau(na) Factors of the Householder vectors - ! - ! q On input: Eigenvectors of tridiagonal matrix - ! On output: Transformed eigenvectors - ! Distribution is like in Scalapack. - ! - ! ldq Leading dimension of q - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! 
- !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=rk) :: tau(na) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: a(lda,*), q(ldq,*) -#else - real(kind=rk) :: a(lda,matrixCols), q(ldq,matrixCols) -#endif - - integer(kind=ik) :: max_stored_rows - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols - integer(kind=ik) :: l_cols, l_rows, l_colh, nstor - integer(kind=ik) :: istep, i, n, nc, ic, ics, ice, nb, cur_pcol - - real(kind=rk), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:) - real(kind=rk), allocatable :: tmat(:,:), h1(:), h2(:) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("trans_ev_real") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - - totalblocks = (na-1)/nblk + 1 - max_blocks_row = (totalblocks-1)/np_rows + 1 - max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q! - - max_local_rows = max_blocks_row*nblk - max_local_cols = max_blocks_col*nblk - - max_stored_rows = (63/nblk+1)*nblk - - allocate(tmat(max_stored_rows,max_stored_rows)) - allocate(h1(max_stored_rows*max_stored_rows)) - allocate(h2(max_stored_rows*max_stored_rows)) - allocate(tmp1(max_local_cols*max_stored_rows)) - allocate(tmp2(max_local_cols*max_stored_rows)) - allocate(hvb(max_local_rows*nblk)) - allocate(hvm(max_local_rows,max_stored_rows)) - - hvm = 0 ! Must be set to 0 !!! - hvb = 0 ! Safety only - - l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! 
Local columns of q - - nstor = 0 - - do istep=1,na,nblk - - ics = MAX(istep,3) - ice = MIN(istep+nblk-1,na) - if (ice<ics) cycle - - cur_pcol = pcol(istep, nblk, np_cols) - - nb = 0 - do ic=ics,ice - - l_colh = local_index(ic , my_pcol, np_cols, nblk, -1) ! Column of Householder vector - l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector - - - if (my_pcol==cur_pcol) then - hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh) - if (my_prow==prow(ic-1, nblk, np_rows)) then - hvb(nb+l_rows) = 1. - endif - endif - - nb = nb+l_rows - enddo - - if (nb>0) & - call MPI_Bcast(hvb,nb,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr) - - nb = 0 - do ic=ics,ice - l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector - hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows) - nstor = nstor+1 - nb = nb+l_rows - enddo - - ! Please note: for smaller matix sizes (na/np_rows<=256), a value of 32 for nstor is enough! - if (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32)) then - - ! Calculate scalar products of stored vectors. - ! This can be done in different ways, we use dsyrk - - tmat = 0 - if (l_rows>0) & - call dsyrk('U','T',nstor,l_rows,1.d0,hvm,ubound(hvm,dim=1),0.d0,tmat,max_stored_rows) - - nc = 0 - do n=1,nstor-1 - h1(nc+1:nc+n) = tmat(1:n,n+1) - nc = nc+n - enddo - - if (nc>0) call mpi_allreduce(h1,h2,nc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - - ! Calculate triangular matrix T - - nc = 0 - tmat(1,1) = tau(ice-nstor+1) - do n=1,nstor-1 - call dtrmv('L','T','N',n,tmat,max_stored_rows,h2(nc+1),1) - tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1) - tmat(n+1,n+1) = tau(ice-nstor+n+1) - nc = nc+n - enddo - - ! 
Q = Q - V * T * V**T * Q - - if (l_rows>0) then - call dgemm('T','N',nstor,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), & - q,ldq,0.d0,tmp1,nstor) - else - tmp1(1:l_cols*nstor) = 0 - endif - call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - if (l_rows>0) then - call dtrmm('L','L','N','N',nstor,l_cols,1.0d0,tmat,max_stored_rows,tmp2,nstor) - call dgemm('N','N',l_rows,l_cols,nstor,-1.d0,hvm,ubound(hvm,dim=1), & - tmp2,nstor,1.d0,q,ldq) - endif - nstor = 0 - endif - - enddo - - deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_real") -#endif - - end subroutine trans_ev_real - - subroutine mult_at_b_real(uplo_a, uplo_c, na, ncb, a, lda, b, ldb, nblk, mpi_comm_rows, mpi_comm_cols, c, ldc) - - !------------------------------------------------------------------------------- - ! mult_at_b_real: Performs C := A**T * B - ! - ! where: A is a square matrix (na,na) which is optionally upper or lower triangular - ! B is a (na,ncb) matrix - ! C is a (na,ncb) matrix where optionally only the upper or lower - ! triangle may be computed - ! - ! Parameters - ! - ! uplo_a 'U' if A is upper triangular - ! 'L' if A is lower triangular - ! anything else if A is a full matrix - ! Please note: This pertains to the original A (as set in the calling program) - ! whereas the transpose of A is used for calculations - ! If uplo_a is 'U' or 'L', the other triangle is not used at all, - ! i.e. it may contain arbitrary numbers - ! - ! uplo_c 'U' if only the upper diagonal part of C is needed - ! 'L' if only the upper diagonal part of C is needed - ! anything else if the full matrix C is needed - ! Please note: Even when uplo_c is 'U' or 'L', the other triangle may be - ! written to a certain extent, i.e. one shouldn't rely on the content there! - ! - ! na Number of rows/columns of A, number of rows of B and C - ! - ! ncb Number of columns of B and C - ! - ! a Matrix A - ! - ! lda Leading dimension of a - ! - ! 
b Matrix B - ! - ! ldb Leading dimension of b - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! c Matrix C - ! - ! ldc Leading dimension of c - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - character*1 :: uplo_a, uplo_c - - integer(kind=ik) :: na, ncb, lda, ldb, nblk, mpi_comm_rows, mpi_comm_cols, ldc - real(kind=rk) :: a(lda,*), b(ldb,*), c(ldc,*) ! remove assumed size! - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, l_rows_np - integer(kind=ik) :: np, n, nb, nblk_mult, lrs, lre, lcs, lce - integer(kind=ik) :: gcol_min, gcol, goff - integer(kind=ik) :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals - integer(kind=ik), allocatable :: lrs_save(:), lre_save(:) - - logical :: a_lower, a_upper, c_lower, c_upper - - real(kind=rk), allocatable :: aux_mat(:,:), aux_bc(:), tmp1(:,:), tmp2(:,:) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("mult_at_b_real") -#endif - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and b - l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b - - ! Block factor for matrix multiplications, must be a multiple of nblk - - if (na/np_rows<=256) then - nblk_mult = (31/nblk+1)*nblk - else - nblk_mult = (63/nblk+1)*nblk - endif - - allocate(aux_mat(l_rows,nblk_mult)) - allocate(aux_bc(l_rows*nblk)) - allocate(lrs_save(nblk)) - allocate(lre_save(nblk)) - - a_lower = .false. - a_upper = .false. - c_lower = .false. - c_upper = .false. - - if (uplo_a=='u' .or. uplo_a=='U') a_upper = .true. 
- if (uplo_a=='l' .or. uplo_a=='L') a_lower = .true. - if (uplo_c=='u' .or. uplo_c=='U') c_upper = .true. - if (uplo_c=='l' .or. uplo_c=='L') c_lower = .true. - - ! Build up the result matrix by processor rows - - do np = 0, np_rows-1 - - ! In this turn, procs of row np assemble the result - - l_rows_np = local_index(na, np, np_rows, nblk, -1) ! local rows on receiving processors - - nr_done = 0 ! Number of rows done - aux_mat = 0 - nstor = 0 ! Number of columns stored in aux_mat - - ! Loop over the blocks on row np - - do nb=0,(l_rows_np-1)/nblk - - goff = nb*np_rows + np ! Global offset in blocks corresponding to nb - - ! Get the processor column which owns this block (A is transposed, so we need the column) - ! and the offset in blocks within this column. - ! The corresponding block column in A is then broadcast to all for multiplication with B - - np_bc = MOD(goff,np_cols) - noff = goff/np_cols - n_aux_bc = 0 - - ! Gather up the complete block column of A on the owner - - do n = 1, min(l_rows_np-nb*nblk,nblk) ! Loop over columns to be broadcast - - gcol = goff*nblk + n ! global column corresponding to n - if (nstor==0 .and. n==1) gcol_min = gcol - - lrs = 1 ! 1st local row number for broadcast - lre = l_rows ! last local row number for broadcast - if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1) - if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) - - if (lrs<=lre) then - nvals = lre-lrs+1 - if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n) - n_aux_bc = n_aux_bc + nvals - endif - - lrs_save(n) = lrs - lre_save(n) = lre - - enddo - - ! Broadcast block column - - call MPI_Bcast(aux_bc,n_aux_bc,MPI_REAL8,np_bc,mpi_comm_cols,mpierr) - - ! 
Insert what we got in aux_mat - - n_aux_bc = 0 - do n = 1, min(l_rows_np-nb*nblk,nblk) - nstor = nstor+1 - lrs = lrs_save(n) - lre = lre_save(n) - if (lrs<=lre) then - nvals = lre-lrs+1 - aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals) - n_aux_bc = n_aux_bc + nvals - endif - enddo - - ! If we got nblk_mult columns in aux_mat or this is the last block - ! do the matrix multiplication - - if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np) then - - lrs = 1 ! 1st local row number for multiply - lre = l_rows ! last local row number for multiply - if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1) - if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) - - lcs = 1 ! 1st local col number for multiply - lce = l_cols ! last local col number for multiply - if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1) - if (c_lower) lce = MIN(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols) - - if (lcs<=lce) then - allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce)) - if (lrs<=lre) then - call dgemm('T','N',nstor,lce-lcs+1,lre-lrs+1,1.d0,aux_mat(lrs,1),ubound(aux_mat,dim=1), & - b(lrs,lcs),ldb,0.d0,tmp1,nstor) - else - tmp1 = 0 - endif - - ! Sum up the results and send to processor row np - call mpi_reduce(tmp1,tmp2,nstor*(lce-lcs+1),MPI_REAL8,MPI_SUM,np,mpi_comm_rows,mpierr) - - ! 
Put the result into C - if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce) - - deallocate(tmp1,tmp2) - endif - - nr_done = nr_done+nstor - nstor=0 - aux_mat(:,:)=0 - endif - enddo - enddo - - deallocate(aux_mat, aux_bc, lrs_save, lre_save) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("mult_at_b_real") -#endif - - end subroutine mult_at_b_real - -#define DATATYPE COMPLEX(kind=ck) -#define BYTESIZE 16 -#define COMPLEXCASE 1 -#include "elpa_transpose_vectors.X90" -#include "elpa_reduce_add_vectors.X90" -#undef DATATYPE -#undef BYTESIZE -#undef COMPLEXCASE - - subroutine tridiag_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) - - !------------------------------------------------------------------------------- - ! tridiag_complex: Reduces a distributed hermitian matrix to tridiagonal form - ! (like Scalapack Routine PZHETRD) - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be reduced. - ! Distribution is like in Scalapack. - ! Opposed to PZHETRD, a(:,:) must be set completely (upper and lower half) - ! a(:,:) is overwritten on exit with the Householder vectors - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! d(na) Diagonal elements (returned), identical on all processors - ! - ! e(na) Off-Diagonal elements (returned), identical on all processors - ! - ! tau(na) Factors for the Householder vectors (returned), needed for back transformation - ! 
- !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - complex(kind=ck) :: tau(na) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck) :: a(lda,*) -#else - complex(kind=ck) :: a(lda,matrixCols) -#endif - real(kind=rk) :: d(na), e(na) - - integer(kind=ik), parameter :: max_stored_rows = 32 - - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols - integer(kind=ik) :: l_cols, l_rows, nstor - integer(kind=ik) :: istep, i, j, lcs, lce, lrs, lre - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - -#ifdef WITH_OPENMP - integer(kind=ik) :: my_thread, n_threads, max_threads, n_iter - integer(kind=ik) :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads -#endif - - real(kind=rk) :: vnorm2 - complex(kind=ck) :: vav, xc, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf - - complex(kind=ck), allocatable :: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:) -#ifdef WITH_OPENMP - complex(kind=ck), allocatable :: ur_p(:,:), uc_p(:,:) -#endif - real(kind=rk), allocatable :: tmpr(:) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("tridiag_complex") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! 
local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile - - - totalblocks = (na-1)/nblk + 1 - max_blocks_row = (totalblocks-1)/np_rows + 1 - max_blocks_col = (totalblocks-1)/np_cols + 1 - - max_local_rows = max_blocks_row*nblk - max_local_cols = max_blocks_col*nblk - - allocate(tmp(MAX(max_local_rows,max_local_cols))) - allocate(vr(max_local_rows+1)) - allocate(ur(max_local_rows)) - allocate(vc(max_local_cols)) - allocate(uc(max_local_cols)) - -#ifdef WITH_OPENMP - max_threads = omp_get_max_threads() - - allocate(ur_p(max_local_rows,0:max_threads-1)) - allocate(uc_p(max_local_cols,0:max_threads-1)) -#endif - - tmp = 0 - vr = 0 - ur = 0 - vc = 0 - uc = 0 - - allocate(vur(max_local_rows,2*max_stored_rows)) - allocate(uvc(max_local_cols,2*max_stored_rows)) - - d(:) = 0 - e(:) = 0 - tau(:) = 0 - - nstor = 0 - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a - if (my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) d(na) = a(l_rows,l_cols) - - do istep=na,3,-1 - - ! Calculate number of local rows and columns of the still remaining matrix - ! on the local processor - - l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1) - l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1) - - ! Calculate vector for Householder transformation on all procs - ! owning column istep - - if (my_pcol==pcol(istep, nblk, np_cols)) then - - ! Get vector to be transformed; distribute last element and norm of - ! remaining elements to all procs in current column - - vr(1:l_rows) = a(1:l_rows,l_cols+1) - if (nstor>0 .and. 
l_rows>0) then - aux(1:2*nstor) = conjg(uvc(l_cols+1,1:2*nstor)) - call ZGEMV('N',l_rows,2*nstor,CONE,vur,ubound(vur,dim=1), & - aux,1,CONE,vr,1) - endif - - if (my_prow==prow(istep-1, nblk, np_rows)) then - aux1(1) = dot_product(vr(1:l_rows-1),vr(1:l_rows-1)) - aux1(2) = vr(l_rows) - else - aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows)) - aux1(2) = 0. - endif - - call mpi_allreduce(aux1,aux2,2,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) - - vnorm2 = aux2(1) - vrl = aux2(2) - - ! Householder transformation - - call hh_transform_complex(vrl, vnorm2, xf, tau(istep)) - - ! Scale vr and store Householder vector for back transformation - - vr(1:l_rows) = vr(1:l_rows) * xf - if (my_prow==prow(istep-1, nblk, np_rows)) then - vr(l_rows) = 1. - e(istep-1) = vrl - endif - a(1:l_rows,l_cols+1) = vr(1:l_rows) ! store Householder vector for back transformation - - endif - - ! Broadcast the Householder vector (and tau) along columns - - if (my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep) - call MPI_Bcast(vr,l_rows+1,MPI_DOUBLE_COMPLEX,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr) - tau(istep) = vr(l_rows+1) - - ! Transpose Householder vector vr -> vc - -! call elpa_transpose_vectors (vr, 2*ubound(vr,dim=1), mpi_comm_rows, & -! vc, 2*ubound(vc,dim=1), mpi_comm_cols, & -! 1, 2*(istep-1), 1, 2*nblk) - - call elpa_transpose_vectors_complex (vr, ubound(vr,dim=1), mpi_comm_rows, & - vc, ubound(vc,dim=1), mpi_comm_cols, & - 1, (istep-1), 1, nblk) - ! Calculate u = (A + VU**T + UV**T)*v - - ! For cache efficiency, we use only the upper half of the matrix tiles for this, - ! thus the result is partly in uc(:) and partly in ur(:) - - uc(1:l_cols) = 0 - ur(1:l_rows) = 0 - if (l_rows>0 .and. 
l_cols>0) then - -#ifdef WITH_OPENMP - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre) - - my_thread = omp_get_thread_num() - n_threads = omp_get_num_threads() - - n_iter = 0 - - uc_p(1:l_cols,my_thread) = 0. - ur_p(1:l_rows,my_thread) = 0. -#endif - - do i=0,(istep-2)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - if (lce<lcs) cycle - do j=0,i - lrs = j*l_rows_tile+1 - lre = min(l_rows,(j+1)*l_rows_tile) - if (lre<lrs) cycle -#ifdef WITH_OPENMP - if (mod(n_iter,n_threads) == my_thread) then - call ZGEMV('C',lre-lrs+1,lce-lcs+1,CONE,a(lrs,lcs),lda,vr(lrs),1,CONE,uc_p(lcs,my_thread),1) - if (i/=j) call ZGEMV('N',lre-lrs+1,lce-lcs+1,CONE,a(lrs,lcs),lda,vc(lcs),1,CONE,ur_p(lrs,my_thread),1) - endif - n_iter = n_iter+1 -#else - call ZGEMV('C',lre-lrs+1,lce-lcs+1,CONE,a(lrs,lcs),lda,vr(lrs),1,CONE,uc(lcs),1) - if (i/=j) call ZGEMV('N',lre-lrs+1,lce-lcs+1,CONE,a(lrs,lcs),lda,vc(lcs),1,CONE,ur(lrs),1) -#endif - enddo - enddo - -#ifdef WITH_OPENMP -!$OMP END PARALLEL -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - - do i=0,max_threads-1 - uc(1:l_cols) = uc(1:l_cols) + uc_p(1:l_cols,i) - ur(1:l_rows) = ur(1:l_rows) + ur_p(1:l_rows,i) - enddo -#endif - - if (nstor>0) then - call ZGEMV('C',l_rows,2*nstor,CONE,vur,ubound(vur,dim=1),vr,1,CZERO,aux,1) - call ZGEMV('N',l_cols,2*nstor,CONE,uvc,ubound(uvc,dim=1),aux,1,CONE,uc,1) - endif - - endif - - ! Sum up all ur(:) parts along rows and add them to the uc(:) parts - ! on the processors containing the diagonal - ! This is only necessary if ur has been calculated, i.e. if the - ! global tile size is smaller than the global remaining matrix - - if (tile_size < istep-1) then - call elpa_reduce_add_vectors_COMPLEX (ur, ubound(ur,dim=1), mpi_comm_rows, & - uc, ubound(uc,dim=1), mpi_comm_cols, & - (istep-1), 1, nblk) - endif - - ! 
Sum up all the uc(:) parts, transpose uc -> ur - - if (l_cols>0) then - tmp(1:l_cols) = uc(1:l_cols) - call mpi_allreduce(tmp,uc,l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) - endif - -! call elpa_transpose_vectors (uc, 2*ubound(uc,dim=1), mpi_comm_cols, & -! ur, 2*ubound(ur,dim=1), mpi_comm_rows, & -! 1, 2*(istep-1), 1, 2*nblk) - - call elpa_transpose_vectors_complex (uc, ubound(uc,dim=1), mpi_comm_cols, & - ur, ubound(ur,dim=1), mpi_comm_rows, & - 1, (istep-1), 1, nblk) - - - - ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v ) - - xc = 0 - if (l_cols>0) xc = dot_product(vc(1:l_cols),uc(1:l_cols)) - call mpi_allreduce(xc,vav,1,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_cols,mpierr) - - ! store u and v in the matrices U and V - ! these matrices are stored combined in one here - - do j=1,l_rows - vur(j,2*nstor+1) = conjg(tau(istep))*vr(j) - vur(j,2*nstor+2) = 0.5*conjg(tau(istep))*vav*vr(j) - ur(j) - enddo - do j=1,l_cols - uvc(j,2*nstor+1) = 0.5*conjg(tau(istep))*vav*vc(j) - uc(j) - uvc(j,2*nstor+2) = conjg(tau(istep))*vc(j) - enddo - - nstor = nstor+1 - - ! If the limit of max_stored_rows is reached, calculate A + VU**T + UV**T - - if (nstor==max_stored_rows .or. istep==3) then - - do i=0,(istep-2)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - lrs = 1 - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce<lcs .or. lre<lrs) cycle - call ZGEMM('N','C',lre-lrs+1,lce-lcs+1,2*nstor,CONE, & - vur(lrs,1),ubound(vur,dim=1),uvc(lcs,1),ubound(uvc,dim=1), & - CONE,a(lrs,lcs),lda) - enddo - - nstor = 0 - - endif - - if (my_prow==prow(istep-1, nblk, np_rows) .and. my_pcol==pcol(istep-1, nblk, np_cols)) then - if (nstor>0) a(l_rows,l_cols) = a(l_rows,l_cols) & - + dot_product(vur(l_rows,1:2*nstor),uvc(l_cols,1:2*nstor)) - d(istep-1) = a(l_rows,l_cols) - endif - - enddo ! istep - - ! Store e(1) and d(1) - - if (my_pcol==pcol(2, nblk, np_cols)) then - if (my_prow==prow(1, nblk, np_rows)) then - ! 
We use last l_cols value of loop above - vrl = a(1,l_cols) - call hh_transform_complex(vrl, 0.d0, xf, tau(2)) - e(1) = vrl - a(1,l_cols) = 1. ! for consistency only - endif - call mpi_bcast(tau(2),1,MPI_DOUBLE_COMPLEX,prow(1, nblk, np_rows),mpi_comm_rows,mpierr) - endif - call mpi_bcast(tau(2),1,MPI_DOUBLE_COMPLEX,pcol(2, nblk, np_cols),mpi_comm_cols,mpierr) - - if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1) - - deallocate(tmp, vr, ur, vc, uc, vur, uvc) - - ! distribute the arrays d and e to all processors - - allocate(tmpr(na)) - tmpr = d - call mpi_allreduce(tmpr,d,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - tmpr = d - call mpi_allreduce(tmpr,d,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) - tmpr = e - call mpi_allreduce(tmpr,e,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - tmpr = e - call mpi_allreduce(tmpr,e,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) - deallocate(tmpr) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("tridiag_complex") -#endif - - end subroutine tridiag_complex - - subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) - - !------------------------------------------------------------------------------- - ! trans_ev_complex: Transforms the eigenvectors of a tridiagonal matrix back - ! to the eigenvectors of the original matrix - ! (like Scalapack Routine PZUNMTR) - ! - ! Parameters - ! - ! na Order of matrix a, number of rows of matrix q - ! - ! nqc Number of columns of matrix q - ! - ! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_complex) - ! Distribution is like in Scalapack. - ! - ! lda Leading dimension of a - ! - ! tau(na) Factors of the Householder vectors - ! - ! q On input: Eigenvectors of tridiagonal matrix - ! On output: Transformed eigenvectors - ! Distribution is like in Scalapack. - ! - ! ldq Leading dimension of q - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! 
- ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - complex(kind=ck) :: tau(na) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck) :: a(lda,*), q(ldq,*) -#else - complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols) -#endif - integer(kind=ik) :: max_stored_rows - - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols - integer(kind=ik) :: l_cols, l_rows, l_colh, nstor - integer(kind=ik) :: istep, i, n, nc, ic, ics, ice, nb, cur_pcol - - complex(kind=ck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:) - complex(kind=ck), allocatable :: tmat(:,:), h1(:), h2(:) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("trans_ev_complex") -#endif - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - - totalblocks = (na-1)/nblk + 1 - max_blocks_row = (totalblocks-1)/np_rows + 1 - max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q! - - max_local_rows = max_blocks_row*nblk - max_local_cols = max_blocks_col*nblk - - max_stored_rows = (63/nblk+1)*nblk - - allocate(tmat(max_stored_rows,max_stored_rows)) - allocate(h1(max_stored_rows*max_stored_rows)) - allocate(h2(max_stored_rows*max_stored_rows)) - allocate(tmp1(max_local_cols*max_stored_rows)) - allocate(tmp2(max_local_cols*max_stored_rows)) - allocate(hvb(max_local_rows*nblk)) - allocate(hvm(max_local_rows,max_stored_rows)) - - hvm = 0 ! Must be set to 0 !!! - hvb = 0 ! 
Safety only - - l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q - - nstor = 0 - - ! In the complex case tau(2) /= 0 - if (my_prow == prow(1, nblk, np_rows)) then - q(1,1:l_cols) = q(1,1:l_cols)*((1.d0,0.d0)-tau(2)) - endif - - do istep=1,na,nblk - - ics = MAX(istep,3) - ice = MIN(istep+nblk-1,na) - if (ice<ics) cycle - - cur_pcol = pcol(istep, nblk, np_cols) - - nb = 0 - do ic=ics,ice - - l_colh = local_index(ic , my_pcol, np_cols, nblk, -1) ! Column of Householder vector - l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector - - - if (my_pcol==cur_pcol) then - hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh) - if (my_prow==prow(ic-1, nblk, np_rows)) then - hvb(nb+l_rows) = 1. - endif - endif - - nb = nb+l_rows - enddo - - if (nb>0) & - call MPI_Bcast(hvb,nb,MPI_DOUBLE_COMPLEX,cur_pcol,mpi_comm_cols,mpierr) - - nb = 0 - do ic=ics,ice - l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector - hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows) - nstor = nstor+1 - nb = nb+l_rows - enddo - - ! Please note: for smaller matix sizes (na/np_rows<=256), a value of 32 for nstor is enough! - if (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32)) then - - ! Calculate scalar products of stored vectors. - ! This can be done in different ways, we use zherk - - tmat = 0 - if (l_rows>0) & - call zherk('U','C',nstor,l_rows,CONE,hvm,ubound(hvm,dim=1),CZERO,tmat,max_stored_rows) - - nc = 0 - do n=1,nstor-1 - h1(nc+1:nc+n) = tmat(1:n,n+1) - nc = nc+n - enddo - - if (nc>0) call mpi_allreduce(h1,h2,nc,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) - - ! Calculate triangular matrix T - - nc = 0 - tmat(1,1) = tau(ice-nstor+1) - do n=1,nstor-1 - call ztrmv('L','C','N',n,tmat,max_stored_rows,h2(nc+1),1) - tmat(n+1,1:n) = -conjg(h2(nc+1:nc+n))*tau(ice-nstor+n+1) - tmat(n+1,n+1) = tau(ice-nstor+n+1) - nc = nc+n - enddo - - ! 
Q = Q - V * T * V**T * Q - - if (l_rows>0) then - call zgemm('C','N',nstor,l_cols,l_rows,CONE,hvm,ubound(hvm,dim=1), & - q,ldq,CZERO,tmp1,nstor) - else - tmp1(1:l_cols*nstor) = 0 - endif - call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) - if (l_rows>0) then - call ztrmm('L','L','N','N',nstor,l_cols,CONE,tmat,max_stored_rows,tmp2,nstor) - call zgemm('N','N',l_rows,l_cols,nstor,-CONE,hvm,ubound(hvm,dim=1), & - tmp2,nstor,CONE,q,ldq) - endif - nstor = 0 - endif - - enddo - - deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_complex") -#endif - - end subroutine trans_ev_complex - - subroutine mult_ah_b_complex(uplo_a, uplo_c, na, ncb, a, lda, b, ldb, nblk, mpi_comm_rows, mpi_comm_cols, c, ldc) - - !------------------------------------------------------------------------------- - ! mult_ah_b_complex: Performs C := A**H * B - ! - ! where: A is a square matrix (na,na) which is optionally upper or lower triangular - ! B is a (na,ncb) matrix - ! C is a (na,ncb) matrix where optionally only the upper or lower - ! triangle may be computed - ! - ! Parameters - ! - ! uplo_a 'U' if A is upper triangular - ! 'L' if A is lower triangular - ! anything else if A is a full matrix - ! Please note: This pertains to the original A (as set in the calling program) - ! whereas the transpose of A is used for calculations - ! If uplo_a is 'U' or 'L', the other triangle is not used at all, - ! i.e. it may contain arbitrary numbers - ! - ! uplo_c 'U' if only the upper diagonal part of C is needed - ! 'L' if only the upper diagonal part of C is needed - ! anything else if the full matrix C is needed - ! Please note: Even when uplo_c is 'U' or 'L', the other triangle may be - ! written to a certain extent, i.e. one shouldn't rely on the content there! - ! - ! na Number of rows/columns of A, number of rows of B and C - ! - ! ncb Number of columns of B and C - ! - ! a Matrix A - ! - ! 
lda Leading dimension of a - ! - ! b Matrix B - ! - ! ldb Leading dimension of b - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! c Matrix C - ! - ! ldc Leading dimension of c - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - character*1 :: uplo_a, uplo_c - - integer(kind=ik) :: na, ncb, lda, ldb, nblk, mpi_comm_rows, mpi_comm_cols, ldc - complex(kind=ck) :: a(lda,*), b(ldb,*), c(ldc,*) ! remove assumed size! - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, l_rows_np - integer(kind=ik) :: np, n, nb, nblk_mult, lrs, lre, lcs, lce - integer(kind=ik) :: gcol_min, gcol, goff - integer(kind=ik) :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals - integer(kind=ik), allocatable :: lrs_save(:), lre_save(:) - - logical :: a_lower, a_upper, c_lower, c_upper - - complex(kind=ck), allocatable :: aux_mat(:,:), aux_bc(:), tmp1(:,:), tmp2(:,:) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("mult_ah_b_complex") -#endif - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and b - l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b - - ! Block factor for matrix multiplications, must be a multiple of nblk - - if (na/np_rows<=256) then - nblk_mult = (31/nblk+1)*nblk - else - nblk_mult = (63/nblk+1)*nblk - endif - - allocate(aux_mat(l_rows,nblk_mult)) - allocate(aux_bc(l_rows*nblk)) - allocate(lrs_save(nblk)) - allocate(lre_save(nblk)) - - a_lower = .false. - a_upper = .false. - c_lower = .false. - c_upper = .false. - - if (uplo_a=='u' .or. 
uplo_a=='U') a_upper = .true. - if (uplo_a=='l' .or. uplo_a=='L') a_lower = .true. - if (uplo_c=='u' .or. uplo_c=='U') c_upper = .true. - if (uplo_c=='l' .or. uplo_c=='L') c_lower = .true. - - ! Build up the result matrix by processor rows - - do np = 0, np_rows-1 - - ! In this turn, procs of row np assemble the result - - l_rows_np = local_index(na, np, np_rows, nblk, -1) ! local rows on receiving processors - - nr_done = 0 ! Number of rows done - aux_mat = 0 - nstor = 0 ! Number of columns stored in aux_mat - - ! Loop over the blocks on row np - - do nb=0,(l_rows_np-1)/nblk - - goff = nb*np_rows + np ! Global offset in blocks corresponding to nb - - ! Get the processor column which owns this block (A is transposed, so we need the column) - ! and the offset in blocks within this column. - ! The corresponding block column in A is then broadcast to all for multiplication with B - - np_bc = MOD(goff,np_cols) - noff = goff/np_cols - n_aux_bc = 0 - - ! Gather up the complete block column of A on the owner - - do n = 1, min(l_rows_np-nb*nblk,nblk) ! Loop over columns to be broadcast - - gcol = goff*nblk + n ! global column corresponding to n - if (nstor==0 .and. n==1) gcol_min = gcol - - lrs = 1 ! 1st local row number for broadcast - lre = l_rows ! last local row number for broadcast - if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1) - if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) - - if (lrs<=lre) then - nvals = lre-lrs+1 - if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n) - n_aux_bc = n_aux_bc + nvals - endif - - lrs_save(n) = lrs - lre_save(n) = lre - - enddo - - ! Broadcast block column - - call MPI_Bcast(aux_bc,n_aux_bc,MPI_DOUBLE_COMPLEX,np_bc,mpi_comm_cols,mpierr) - - ! 
Insert what we got in aux_mat - - n_aux_bc = 0 - do n = 1, min(l_rows_np-nb*nblk,nblk) - nstor = nstor+1 - lrs = lrs_save(n) - lre = lre_save(n) - if (lrs<=lre) then - nvals = lre-lrs+1 - aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals) - n_aux_bc = n_aux_bc + nvals - endif - enddo - - ! If we got nblk_mult columns in aux_mat or this is the last block - ! do the matrix multiplication - - if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np) then - - lrs = 1 ! 1st local row number for multiply - lre = l_rows ! last local row number for multiply - if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1) - if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) - - lcs = 1 ! 1st local col number for multiply - lce = l_cols ! last local col number for multiply - if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1) - if (c_lower) lce = MIN(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols) - - if (lcs<=lce) then - allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce)) - if (lrs<=lre) then - call zgemm('C','N',nstor,lce-lcs+1,lre-lrs+1,(1.d0,0.d0),aux_mat(lrs,1),ubound(aux_mat,dim=1), & - b(lrs,lcs),ldb,(0.d0,0.d0),tmp1,nstor) - else - tmp1 = 0 - endif - - ! Sum up the results and send to processor row np - call mpi_reduce(tmp1,tmp2,nstor*(lce-lcs+1),MPI_DOUBLE_COMPLEX,MPI_SUM,np,mpi_comm_rows,mpierr) - - ! 
Put the result into C - if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce) - - deallocate(tmp1,tmp2) - endif - - nr_done = nr_done+nstor - nstor=0 - aux_mat(:,:)=0 - endif - enddo - enddo - - deallocate(aux_mat, aux_bc, lrs_save, lre_save) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("mult_ah_b_complex") -#endif - - end subroutine mult_ah_b_complex - - subroutine solve_tridi( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success ) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=rk) :: d(na), e(na) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: q(ldq,*) -#else - real(kind=rk) :: q(ldq,matrixCols) -#endif - - integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - - integer(kind=ik), allocatable :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:) - - logical, intent(in) :: wantDebug - logical, intent(out) :: success -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_tridi") -#endif - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - success = .true. - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q - - ! Set Q to 0 - - q(1:l_rows, 1:l_cols) = 0. - - ! Get the limits of the subdivisons, each subdivison has as many cols - ! as fit on the respective processor column - - allocate(limits(0:np_cols)) - - limits(0) = 0 - do np=0,np_cols-1 - nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np - - ! Check for the case that a column has have zero width. - ! This is not supported! - ! 
Scalapack supports it but delivers no results for these columns, - ! which is rather annoying - if (nc==0) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi") -#endif - if (wantDebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width' - success = .false. - return - endif - limits(np+1) = limits(np) + nc - enddo - - ! Subdivide matrix by subtracting rank 1 modifications - - do i=1,np_cols-1 - n = limits(i) - d(n) = d(n)-abs(e(n)) - d(n+1) = d(n+1)-abs(e(n)) - enddo - - ! Solve sub problems on processsor columns - - nc = limits(my_pcol) ! column after which my problem starts - - if (np_cols>1) then - nev1 = l_cols ! all eigenvectors are needed - else - nev1 = MIN(nev,l_cols) - endif - call solve_tridi_col(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, & - matrixCols, mpi_comm_rows, wantDebug, success) - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi") -#endif - return - endif - ! If there is only 1 processor column, we are done - - if (np_cols==1) then - deallocate(limits) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi") -#endif - return - endif - - ! Set index arrays for Q columns - - ! Dense distribution scheme: - - allocate(l_col(na)) - allocate(p_col(na)) - - n = 0 - do np=0,np_cols-1 - nc = local_index(na, np, np_cols, nblk, -1) - do i=1,nc - n = n+1 - l_col(n) = i - p_col(n) = np - enddo - enddo - - ! Block cyclic distribution scheme, only nev columns are set: - - allocate(l_col_bc(na)) - allocate(p_col_bc(na)) - p_col_bc(:) = -1 - l_col_bc(:) = -1 - - do i = 0, na-1, nblk*np_cols - do j = 0, np_cols-1 - do n = 1, nblk - if (i+j*nblk+n <= MIN(nev,na)) then - p_col_bc(i+j*nblk+n) = j - l_col_bc(i+j*nblk+n) = i/np_cols + n - endif - enddo - enddo - enddo - - ! 
Recursively merge sub problems - - call merge_recursive(0, np_cols, wantDebug, success) - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi") -#endif - return - endif - - deallocate(limits,l_col,p_col,l_col_bc,p_col_bc) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi") -#endif - return - - contains - recursive subroutine merge_recursive(np_off, nprocs, wantDebug, success) - use precision - implicit none - - ! noff is always a multiple of nblk_ev - ! nlen-noff is always > nblk_ev - - integer(kind=ik) :: np_off, nprocs - integer(kind=ik) :: np1, np2, noff, nlen, nmid, n, & - mpi_status(mpi_status_size) - logical, intent(in) :: wantDebug - logical, intent(out) :: success - - success = .true. - - if (nprocs<=1) then - ! Safety check only - if (wantDebug) write(error_unit,*) "ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs - success = .false. - return - endif - ! Split problem into 2 subproblems of size np1 / np2 - - np1 = nprocs/2 - np2 = nprocs-np1 - - if (np1 > 1) call merge_recursive(np_off, np1, wantDebug, success) - if (.not.(success)) return - if (np2 > 1) call merge_recursive(np_off+np1, np2, wantDebug, success) - if (.not.(success)) return - - noff = limits(np_off) - nmid = limits(np_off+np1) - noff - nlen = limits(np_off+nprocs) - noff - - if (my_pcol==np_off) then - do n=np_off+np1,np_off+nprocs-1 - call mpi_send(d(noff+1),nmid,MPI_REAL8,n,1,mpi_comm_cols,mpierr) - enddo - endif - if (my_pcol>=np_off+np1 .and. my_pcol<np_off+nprocs) then - call mpi_recv(d(noff+1),nmid,MPI_REAL8,np_off,1,mpi_comm_cols,mpi_status,mpierr) - endif - - if (my_pcol==np_off+np1) then - do n=np_off,np_off+np1-1 - call mpi_send(d(noff+nmid+1),nlen-nmid,MPI_REAL8,n,1,mpi_comm_cols,mpierr) - enddo - endif - if (my_pcol>=np_off .and. my_pcol<np_off+np1) then - call mpi_recv(d(noff+nmid+1),nlen-nmid,MPI_REAL8,np_off+np1,1,mpi_comm_cols,mpi_status,mpierr) - endif - - if (nprocs == np_cols) then - - ! 
Last merge, result distribution must be block cyclic, noff==0, - ! p_col_bc is set so that only nev eigenvalues are calculated - - call merge_systems(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, & - nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col, p_col, & - l_col_bc, p_col_bc, np_off, nprocs, wantDebug, success ) - if (.not.(success)) return - else - ! Not last merge, leave dense column distribution - - call merge_systems(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, & - nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col(noff+1), p_col(noff+1), & - l_col(noff+1), p_col(noff+1), np_off, nprocs, wantDebug, success ) - if (.not.(success)) return - endif - - end subroutine merge_recursive - - end subroutine solve_tridi - - subroutine solve_tridi_col( na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, wantDebug, success ) - - ! Solves the symmetric, tridiagonal eigenvalue problem on one processor column - ! with the divide and conquer method. - ! Works best if the number of processor rows is a power of 2! -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, nev, nqoff, ldq, nblk, matrixCols, mpi_comm_rows - real(kind=rk) :: d(na), e(na) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: q(ldq,*) -#else - real(kind=rk) :: q(ldq,matrixCols) -#endif - - integer(kind=ik), parameter :: min_submatrix_size = 16 ! 
Minimum size of the submatrices to be used - - real(kind=rk), allocatable :: qmat1(:,:), qmat2(:,:) - integer(kind=ik) :: i, n, np - integer(kind=ik) :: ndiv, noff, nmid, nlen, max_size - integer(kind=ik) :: my_prow, np_rows, mpierr - - integer(kind=ik), allocatable :: limits(:), l_col(:), p_col_i(:), p_col_o(:) - logical, intent(in) :: wantDebug - logical, intent(out) :: success -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_tridi_col") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - - success = .true. - ! Calculate the number of subdivisions needed. - - n = na - ndiv = 1 - do while(2*ndiv<=np_rows .and. n>2*min_submatrix_size) - n = ((n+3)/4)*2 ! the bigger one of the two halves, we want EVEN boundaries - ndiv = ndiv*2 - enddo - - ! If there is only 1 processor row and not all eigenvectors are needed - ! and the matrix size is big enough, then use 2 subdivisions - ! so that merge_systems is called once and only the needed - ! eigenvectors are calculated for the final problem. - - if (np_rows==1 .and. nev<na .and. na>2*min_submatrix_size) ndiv = 2 - - allocate(limits(0:ndiv)) - - limits(0) = 0 - limits(ndiv) = na - - n = ndiv - do while(n>1) - n = n/2 ! n is always a power of 2 - do i=0,ndiv-1,2*n - ! We want to have even boundaries (for cache line alignments) - limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2 - enddo - enddo - - ! Calculate the maximum size of a subproblem - - max_size = 0 - do i=1,ndiv - max_size = MAX(max_size,limits(i)-limits(i-1)) - enddo - - ! Subdivide matrix by subtracting rank 1 modifications - - do i=1,ndiv-1 - n = limits(i) - d(n) = d(n)-abs(e(n)) - d(n+1) = d(n+1)-abs(e(n)) - enddo - - if (np_rows==1) then - - ! For 1 processor row there may be 1 or 2 subdivisions - - do n=0,ndiv-1 - noff = limits(n) ! Start of subproblem - nlen = limits(n+1)-noff ! 
Size of subproblem - - call solve_tridi_single(nlen,d(noff+1),e(noff+1), & - q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success) - if (.not.(success)) return - enddo - - else - - ! Solve sub problems in parallel with solve_tridi_single - ! There is at maximum 1 subproblem per processor - - allocate(qmat1(max_size,max_size)) - allocate(qmat2(max_size,max_size)) - - qmat1 = 0 ! Make sure that all elements are defined - - if (my_prow < ndiv) then - - noff = limits(my_prow) ! Start of subproblem - nlen = limits(my_prow+1)-noff ! Size of subproblem - - call solve_tridi_single(nlen,d(noff+1),e(noff+1),qmat1, & - ubound(qmat1,dim=1), wantDebug, success) - - if (.not.(success)) return - endif - - ! Fill eigenvectors in qmat1 into global matrix q - - do np = 0, ndiv-1 - - noff = limits(np) - nlen = limits(np+1)-noff - - call MPI_Bcast(d(noff+1),nlen,MPI_REAL8,np,mpi_comm_rows,mpierr) - qmat2 = qmat1 - call MPI_Bcast(qmat2,max_size*max_size,MPI_REAL8,np,mpi_comm_rows,mpierr) - - do i=1,nlen - call distribute_global_column(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk) - enddo - - enddo - - deallocate(qmat1, qmat2) - - endif - - ! Allocate and set index arrays l_col and p_col - - allocate(l_col(na), p_col_i(na), p_col_o(na)) - - do i=1,na - l_col(i) = i - p_col_i(i) = 0 - p_col_o(i) = 0 - enddo - - ! Merge subproblems - - n = 1 - do while(n<ndiv) ! if ndiv==1, the problem was solved by single call to solve_tridi_single - - do i=0,ndiv-1,2*n - - noff = limits(i) - nmid = limits(i+n) - noff - nlen = limits(i+2*n) - noff - - if (nlen == na) then - ! 
Last merge, set p_col_o=-1 for unneeded (output) eigenvectors - p_col_o(nev+1:na) = -1 - endif - - call merge_systems(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_self, l_col(noff+1), p_col_i(noff+1), & - l_col(noff+1), p_col_o(noff+1), 0, 1, wantDebug, success) - if (.not.(success)) return - - enddo - - n = 2*n - - enddo - - deallocate(limits, l_col, p_col_i, p_col_o) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi_col") -#endif - - end subroutine solve_tridi_col - - subroutine solve_tridi_single(nlen, d, e, q, ldq, wantDebug, success) - - ! Solves the symmetric, tridiagonal eigenvalue problem on a single processor. - ! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly. -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: nlen, ldq - real(kind=rk) :: d(nlen), e(nlen), q(ldq,nlen) - - real(kind=rk), allocatable :: work(:), qtmp(:), ds(:), es(:) - real(kind=rk) :: dtmp - - integer(kind=ik) :: i, j, lwork, liwork, info, mpierr - integer(kind=ik), allocatable :: iwork(:) - - logical, intent(in) :: wantDebug - logical, intent(out) :: success - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_tridi_single") -#endif - - success = .true. - allocate(ds(nlen), es(nlen)) - - ! Save d and e for the case that dstedc fails - - ds(:) = d(:) - es(:) = e(:) - - ! First try dstedc, this is normally faster but it may fail sometimes (why???) - - lwork = 1 + 4*nlen + nlen**2 - liwork = 3 + 5*nlen - allocate(work(lwork), iwork(liwork)) - call dstedc('I',nlen,d,e,q,ldq,work,lwork,iwork,liwork,info) - - if (info /= 0) then - - ! DSTEDC failed, try DSTEQR. The workspace is enough for DSTEQR. - - write(error_unit,'(a,i8,a)') 'Warning: Lapack routine DSTEDC failed, info= ',info,', Trying DSTEQR!' - - d(:) = ds(:) - e(:) = es(:) - call dsteqr('I',nlen,d,e,q,ldq,work,info) - - ! 
If DSTEQR fails also, we don't know what to do further ... - - if (info /= 0) then - if (wantDebug) & - write(error_unit,'(a,i8,a)') 'ELPA1_solve_tridi_single: ERROR: Lapack routine DSTEQR failed, info= ',info,', Aborting!' - success = .false. - return - endif - end if - - deallocate(work,iwork,ds,es) - - ! Check if eigenvalues are monotonically increasing - ! This seems to be not always the case (in the IBM implementation of dstedc ???) - - do i=1,nlen-1 - if (d(i+1)<d(i)) then - if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1d-14) then - write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:',i+1,d(i),d(i+1) - else - write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1) - write(error_unit,'(a)') 'The eigenvalues from a lapack call are not sorted to machine precision.' - write(error_unit,'(a)') 'In this extent, this is completely harmless.' - write(error_unit,'(a)') 'Still, we keep this info message just in case.' - end if - allocate(qtmp(nlen)) - dtmp = d(i+1) - qtmp(1:nlen) = q(1:nlen,i+1) - do j=i,1,-1 - if (dtmp<d(j)) then - d(j+1) = d(j) - q(1:nlen,j+1) = q(1:nlen,j) - else - exit ! 
Loop - endif - enddo - d(j+1) = dtmp - q(1:nlen,j+1) = qtmp(1:nlen) - deallocate(qtmp) - endif - enddo -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi_single") -#endif - - end subroutine solve_tridi_single - - subroutine merge_systems( na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, & - l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, wantDebug, success) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, nm, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, & - mpi_comm_cols, npc_0, npc_n - integer(kind=ik) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na) - real(kind=rk) :: d(na), e -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: q(ldq,*) -#else - real(kind=rk) :: q(ldq,matrixCols) -#endif - - integer(kind=ik), parameter :: max_strip=128 - - real(kind=rk) :: beta, sig, s, c, t, tau, rho, eps, tol, dlamch, & - dlapy2, qtrans(2,2), dmax, zmax, d1new, d2new - real(kind=rk) :: z(na), d1(na), d2(na), z1(na), delta(na), & - dbase(na), ddiff(na), ev_scale(na), tmp(na) - real(kind=rk) :: d1u(na), zu(na), d1l(na), zl(na) - real(kind=rk), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:) -#ifdef WITH_OPENMP - real(kind=rk), allocatable :: z_p(:,:) -#endif - - integer(kind=ik) :: i, j, na1, na2, l_rows, l_cols, l_rqs, l_rqe, & - l_rqm, ns, info - integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, & - l_cols_qreorg, np, l_idx, nqcols1, nqcols2 - integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, & - np_cols, mpierr, mpi_status(mpi_status_size) - integer(kind=ik) :: np_next, np_prev, np_rem - integer(kind=ik) :: idx(na), idx1(na), idx2(na) - integer(kind=ik) :: coltyp(na), idxq1(na), idxq2(na) - - logical, intent(in) :: wantDebug - logical, intent(out) :: success -#ifdef WITH_OPENMP - integer(kind=ik) :: max_threads, my_thread - integer(kind=ik) :: omp_get_max_threads, omp_get_thread_num - - max_threads = omp_get_max_threads() - - 
allocate(z_p(na,0:max_threads-1)) -#endif -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("merge_systems") -#endif - success = .true. - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - ! If my processor column isn't in the requested set, do nothing - - if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - return - endif - ! Determine number of "next" and "prev" column for ring sends - - if (my_pcol == npc_0+npc_n-1) then - np_next = npc_0 - else - np_next = my_pcol + 1 - endif - - if (my_pcol == npc_0) then - np_prev = npc_0+npc_n-1 - else - np_prev = my_pcol - 1 - endif - - call check_monotony(nm,d,'Input1',wantDebug, success) - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - return - endif - call check_monotony(na-nm,d(nm+1),'Input2',wantDebug, success) - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - return - endif - ! Get global number of processors and my processor number. - ! Please note that my_proc does not need to match any real processor number, - ! it is just used for load balancing some loops. - - n_procs = np_rows*npc_n - my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major - - - ! Local limits of the rows of Q - - l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q - l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm - l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q - - l_rnm = l_rqm-l_rqs+1 ! Number of local rows <= nm - l_rows = l_rqe-l_rqs+1 ! Total number of local rows - - - ! My number of local columns - - l_cols = COUNT(p_col(1:na)==my_pcol) - - ! 
Get max number of local columns - - max_local_cols = 0 - do np = npc_0, npc_0+npc_n-1 - max_local_cols = MAX(max_local_cols,COUNT(p_col(1:na)==np)) - enddo - - ! Calculations start here - - beta = abs(e) - sig = sign(1.d0,e) - - ! Calculate rank-1 modifier z - - z(:) = 0 - - if (MOD((nqoff+nm-1)/nblk,np_rows)==my_prow) then - ! nm is local on my row - do i = 1, na - if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i)) - enddo - endif - - if (MOD((nqoff+nm)/nblk,np_rows)==my_prow) then - ! nm+1 is local on my row - do i = 1, na - if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i)) - enddo - endif - - call global_gather(z, na) - - ! Normalize z so that norm(z) = 1. Since z is the concatenation of - ! two normalized vectors, norm2(z) = sqrt(2). - - z = z/sqrt(2.0d0) - rho = 2.*beta - - ! Calculate index for merging both systems by ascending eigenvalues - - call DLAMRG( nm, na-nm, d, 1, 1, idx ) - - ! Calculate the allowable deflation tolerance - - zmax = maxval(abs(z)) - dmax = maxval(abs(d)) - EPS = DLAMCH( 'Epsilon' ) - TOL = 8.*EPS*MAX(dmax,zmax) - - ! If the rank-1 modifier is small enough, no more needs to be done - ! except to reorganize D and Q - - IF ( RHO*zmax <= TOL ) THEN - - ! Rearrange eigenvalues - - tmp = d - do i=1,na - d(i) = tmp(idx(i)) - enddo - - ! Rearrange eigenvectors - - call resort_ev(idx, na) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - - return - ENDIF - - ! Merge and deflate system - - na1 = 0 - na2 = 0 - - ! COLTYP: - ! 1 : non-zero in the upper half only; - ! 2 : dense; - ! 3 : non-zero in the lower half only; - ! 4 : deflated. - - coltyp(1:nm) = 1 - coltyp(nm+1:na) = 3 - - do i=1,na - - if (rho*abs(z(idx(i))) <= tol) then - - ! Deflate due to small z component. - - na2 = na2+1 - d2(na2) = d(idx(i)) - idx2(na2) = idx(i) - coltyp(idx(i)) = 4 - - else if (na1>0) then - - ! Check if eigenvalues are close enough to allow deflation. - - S = Z(idx(i)) - C = Z1(na1) - - ! 
Find sqrt(a**2+b**2) without overflow or - ! destructive underflow. - - TAU = DLAPY2( C, S ) - T = D1(na1) - D(idx(i)) - C = C / TAU - S = -S / TAU - IF ( ABS( T*C*S ) <= TOL ) THEN - - ! Deflation is possible. - - na2 = na2+1 - - Z1(na1) = TAU - - d2new = D(idx(i))*C**2 + D1(na1)*S**2 - d1new = D(idx(i))*S**2 + D1(na1)*C**2 - - ! D(idx(i)) >= D1(na1) and C**2 + S**2 == 1.0 - ! This means that after the above transformation it must be - ! D1(na1) <= d1new <= D(idx(i)) - ! D1(na1) <= d2new <= D(idx(i)) - ! - ! D1(na1) may get bigger but it is still smaller than the next D(idx(i+1)) - ! so there is no problem with sorting here. - ! d2new <= D(idx(i)) which means that it might be smaller than D2(na2-1) - ! which makes a check (and possibly a resort) necessary. - ! - ! The above relations may not hold exactly due to numeric differences - ! so they have to be enforced in order not to get troubles with sorting. - - - if (d1new<D1(na1) ) d1new = D1(na1) - if (d1new>D(idx(i))) d1new = D(idx(i)) - - if (d2new<D1(na1) ) d2new = D1(na1) - if (d2new>D(idx(i))) d2new = D(idx(i)) - - D1(na1) = d1new - - do j=na2-1,1,-1 - if (d2new<d2(j)) then - d2(j+1) = d2(j) - idx2(j+1) = idx2(j) - else - exit ! Loop - endif - enddo - - d2(j+1) = d2new - idx2(j+1) = idx(i) - - qtrans(1,1) = C; qtrans(1,2) =-S - qtrans(2,1) = S; qtrans(2,2) = C - - call transform_columns(idx(i), idx1(na1)) - - if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2 - if (coltyp(idx(i))==3 .and. 
coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2 - - coltyp(idx(i)) = 4 - - else - na1 = na1+1 - d1(na1) = d(idx(i)) - z1(na1) = z(idx(i)) - idx1(na1) = idx(i) - endif - else - na1 = na1+1 - d1(na1) = d(idx(i)) - z1(na1) = z(idx(i)) - idx1(na1) = idx(i) - endif - - enddo - call check_monotony(na1,d1,'Sorted1', wantDebug, success) - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - return - endif - call check_monotony(na2,d2,'Sorted2', wantDebug, success) - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - return - endif - - if (na1==1 .or. na1==2) then - ! if(my_proc==0) print *,'--- Remark solve_tridi: na1==',na1,' proc==',myid - - if (na1==1) then - d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation - else ! na1==2 - call DLAED5(1, d1, z1, qtrans(1,1), rho, d(1)) - call DLAED5(2, d1, z1, qtrans(1,2), rho, d(2)) - - call transform_columns(idx1(1), idx1(2)) - endif - - ! Add the deflated eigenvalues - d(na1+1:na) = d2(1:na2) - - ! Calculate arrangement of all eigenvalues in output - - call DLAMRG( na1, na-na1, d, 1, 1, idx ) - - ! Rearrange eigenvalues - - tmp = d - do i=1,na - d(i) = tmp(idx(i)) - enddo - - ! Rearrange eigenvectors - - do i=1,na - if (idx(i)<=na1) then - idxq1(i) = idx1(idx(i)) - else - idxq1(i) = idx2(idx(i)-na1) - endif - enddo - - call resort_ev(idxq1, na) - - else if (na1>2) then - - ! Solve secular equation - - z(1:na1) = 1 -#ifdef WITH_OPENMP - z_p(1:na1,:) = 1 -#endif - dbase(1:na1) = 0 - ddiff(1:na1) = 0 - - info = 0 -#ifdef WITH_OPENMP - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j) - my_thread = omp_get_thread_num() -!$OMP DO -#endif - DO i = my_proc+1, na1, n_procs ! work distributed over all processors - - call DLAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used! - - if (info/=0) then - ! 
If DLAED4 fails (may happen especially for LAPACK versions before 3.2) - ! use the more stable bisection algorithm in solve_secular_equation - ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection' - call solve_secular_equation(na1, i, d1, z1, delta, rho, s) - endif - - ! Compute updated z - -#ifdef WITH_OPENMP - do j=1,na1 - if (i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) ) - enddo - z_p(i,my_thread) = z_p(i,my_thread)*delta(i) -#else - do j=1,na1 - if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) ) - enddo - z(i) = z(i)*delta(i) -#endif - ! store dbase/ddiff - - if (i<na1) then - if (abs(delta(i+1)) < abs(delta(i))) then - dbase(i) = d1(i+1) - ddiff(i) = delta(i+1) - else - dbase(i) = d1(i) - ddiff(i) = delta(i) - endif - else - dbase(i) = d1(i) - ddiff(i) = delta(i) - endif - enddo -#ifdef WITH_OPENMP -!$OMP END PARALLEL - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - - do i = 0, max_threads-1 - z(1:na1) = z(1:na1)*z_p(1:na1,i) - enddo -#endif - - call global_product(z, na1) - z(1:na1) = SIGN( SQRT( -z(1:na1) ), z1(1:na1) ) - - call global_gather(dbase, na1) - call global_gather(ddiff, na1) - d(1:na1) = dbase(1:na1) - ddiff(1:na1) - - ! Calculate scale factors for eigenvectors - - ev_scale(:) = 0. - -#ifdef WITH_OPENMP - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$OMP PARALLEL DO PRIVATE(i) SHARED(na1, my_proc, n_procs, & -!$OMP d1,dbase, ddiff, z, ev_scale) & -!$OMP DEFAULT(NONE) - -#endif - DO i = my_proc+1, na1, n_procs ! work distributed over all processors - - ! tmp(1:na1) = z(1:na1) / delta(1:na1,i) ! original code - ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results - - ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i) - ! in exactly this order, but we want to prevent compiler optimization -! ev_scale_val = ev_scale(i) - call add_tmp(d1, dbase, ddiff, z, ev_scale(i), na1,i) -! 
ev_scale(i) = ev_scale_val - enddo -#ifdef WITH_OPENMP -!$OMP END PARALLEL DO - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#endif - - call global_gather(ev_scale, na1) - - ! Add the deflated eigenvalues - d(na1+1:na) = d2(1:na2) - - ! Calculate arrangement of all eigenvalues in output - - call DLAMRG( na1, na-na1, d, 1, 1, idx ) - - ! Rearrange eigenvalues - - tmp = d - do i=1,na - d(i) = tmp(idx(i)) - enddo - call check_monotony(na,d,'Output', wantDebug, success) - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - return - endif - ! Eigenvector calculations - - - ! Calculate the number of columns in the new local matrix Q - ! which are updated from non-deflated/deflated eigenvectors. - ! idxq1/2 stores the global column numbers. - - nqcols1 = 0 ! number of non-deflated eigenvectors - nqcols2 = 0 ! number of deflated eigenvectors - DO i = 1, na - if (p_col_out(i)==my_pcol) then - if (idx(i)<=na1) then - nqcols1 = nqcols1+1 - idxq1(nqcols1) = i - else - nqcols2 = nqcols2+1 - idxq2(nqcols2) = i - endif - endif - enddo - - allocate(ev(max_local_cols,MIN(max_strip,MAX(1,nqcols1)))) - allocate(qtmp1(MAX(1,l_rows),max_local_cols)) - allocate(qtmp2(MAX(1,l_rows),MIN(max_strip,MAX(1,nqcols1)))) - - ! Gather nonzero upper/lower components of old matrix Q - ! which are needed for multiplication with new eigenvectors - - qtmp1 = 0 ! May contain empty (unset) parts - qtmp2 = 0 ! Not really needed - - nnzu = 0 - nnzl = 0 - do i = 1, na1 - l_idx = l_col(idx1(i)) - if (p_col(idx1(i))==my_pcol) then - if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then - nnzu = nnzu+1 - qtmp1(1:l_rnm,nnzu) = q(l_rqs:l_rqm,l_idx) - endif - if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then - nnzl = nnzl+1 - qtmp1(l_rnm+1:l_rows,nnzl) = q(l_rqm+1:l_rqe,l_idx) - endif - endif - enddo - - ! 
Gather deflated eigenvalues behind nonzero components - - ndef = max(nnzu,nnzl) - do i = 1, na2 - l_idx = l_col(idx2(i)) - if (p_col(idx2(i))==my_pcol) then - ndef = ndef+1 - qtmp1(1:l_rows,ndef) = q(l_rqs:l_rqe,l_idx) - endif - enddo - - l_cols_qreorg = ndef ! Number of columns in reorganized matrix - - ! Set (output) Q to 0, it will sum up new Q - - DO i = 1, na - if(p_col_out(i)==my_pcol) q(l_rqs:l_rqe,l_col_out(i)) = 0 - enddo - - np_rem = my_pcol - - do np = 1, npc_n - - ! Do a ring send of qtmp1 - - if (np>1) then - - if (np_rem==npc_0) then - np_rem = npc_0+npc_n-1 - else - np_rem = np_rem-1 - endif - - call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, MPI_REAL8, & - np_next, 1111, np_prev, 1111, & - mpi_comm_cols, mpi_status, mpierr) - endif - - ! Gather the parts in d1 and z which are fitting to qtmp1. - ! This also delivers nnzu/nnzl for proc np_rem - - nnzu = 0 - nnzl = 0 - do i=1,na1 - if (p_col(idx1(i))==np_rem) then - if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then - nnzu = nnzu+1 - d1u(nnzu) = d1(i) - zu (nnzu) = z (i) - endif - if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then - nnzl = nnzl+1 - d1l(nnzl) = d1(i) - zl (nnzl) = z (i) - endif - endif - enddo - - ! Set the deflated eigenvectors in Q (comming from proc np_rem) - - ndef = MAX(nnzu,nnzl) ! Remote counter in input matrix - do i = 1, na - j = idx(i) - if (j>na1) then - if (p_col(idx2(j-na1))==np_rem) then - ndef = ndef+1 - if (p_col_out(i)==my_pcol) & - q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef) - endif - endif - enddo - - do ns = 0, nqcols1-1, max_strip ! strimining loop - - ncnt = MIN(max_strip,nqcols1-ns) ! number of columns in this strip - - ! Get partial result from (output) Q - - do i = 1, ncnt - qtmp2(1:l_rows,i) = q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) - enddo - - ! Compute eigenvectors of the rank-1 modified matrix. - ! Parts for multiplying with upper half of Q: - - do i = 1, ncnt - j = idx(idxq1(i+ns)) - ! 
Calculate the j-th eigenvector of the deflated system - ! See above why we are doing it this way! - tmp(1:nnzu) = d1u(1:nnzu)-dbase(j) - call v_add_s(tmp,nnzu,ddiff(j)) - ev(1:nnzu,i) = zu(1:nnzu) / tmp(1:nnzu) * ev_scale(j) - enddo - - ! Multiply old Q with eigenvectors (upper half) - - if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) & - call dgemm('N','N',l_rnm,ncnt,nnzu,1.d0,qtmp1,ubound(qtmp1,dim=1),ev,ubound(ev,dim=1), & - 1.d0,qtmp2(1,1),ubound(qtmp2,dim=1)) - - ! Compute eigenvectors of the rank-1 modified matrix. - ! Parts for multiplying with lower half of Q: - - do i = 1, ncnt - j = idx(idxq1(i+ns)) - ! Calculate the j-th eigenvector of the deflated system - ! See above why we are doing it this way! - tmp(1:nnzl) = d1l(1:nnzl)-dbase(j) - call v_add_s(tmp,nnzl,ddiff(j)) - ev(1:nnzl,i) = zl(1:nnzl) / tmp(1:nnzl) * ev_scale(j) - enddo - - ! Multiply old Q with eigenvectors (lower half) - - if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) & - call dgemm('N','N',l_rows-l_rnm,ncnt,nnzl,1.d0,qtmp1(l_rnm+1,1),ubound(qtmp1,dim=1),ev,ubound(ev,dim=1), & - 1.d0,qtmp2(l_rnm+1,1),ubound(qtmp2,dim=1)) - - ! Put partial result into (output) Q - - do i = 1, ncnt - q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i) - enddo - - enddo - enddo - - deallocate(ev, qtmp1, qtmp2) - - endif -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - return - - contains - subroutine add_tmp(d1, dbase, ddiff, z, ev_scale_value, na1,i) - use precision - implicit none - - integer(kind=ik), intent(in) :: na1, i - - real(kind=rk), intent(in) :: d1(:), dbase(:), ddiff(:), z(:) - real(kind=rk), intent(inout) :: ev_scale_value - real(kind=rk) :: tmp(1:na1) - - ! tmp(1:na1) = z(1:na1) / delta(1:na1,i) ! original code - ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results - - ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i) - ! 
in exactly this order, but we want to prevent compiler optimization - - tmp(1:na1) = d1(1:na1) -dbase(i) - call v_add_s(tmp(1:na1),na1,ddiff(i)) - - tmp(1:na1) = z(1:na1) / tmp(1:na1) - - ev_scale_value = 1.0/sqrt(dot_product(tmp(1:na1),tmp(1:na1))) - - end subroutine add_tmp - - subroutine resort_ev(idx_ev, nLength) - use precision - implicit none - - integer(kind=ik), intent(in) :: nLength - integer(kind=ik) :: idx_ev(nLength) - integer(kind=ik) :: i, nc, pc1, pc2, lc1, lc2, l_cols_out - - real(kind=rk), allocatable :: qtmp(:,:) - - if (l_rows==0) return ! My processor column has no work to do - - ! Resorts eigenvectors so that q_new(:,i) = q_old(:,idx_ev(i)) - - l_cols_out = COUNT(p_col_out(1:na)==my_pcol) - allocate(qtmp(l_rows,l_cols_out)) - - nc = 0 - - do i=1,na - - pc1 = p_col(idx_ev(i)) - lc1 = l_col(idx_ev(i)) - pc2 = p_col_out(i) - - if (pc2<0) cycle ! This column is not needed in output - - if (pc2==my_pcol) nc = nc+1 ! Counter for output columns - - if (pc1==my_pcol) then - if (pc2==my_pcol) then - ! send and recieve column are local - qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1) - else - call mpi_send(q(l_rqs,lc1),l_rows,MPI_REAL8,pc2,mod(i,4096),mpi_comm_cols,mpierr) - endif - else if (pc2==my_pcol) then - call mpi_recv(qtmp(1,nc),l_rows,MPI_REAL8,pc1,mod(i,4096),mpi_comm_cols,mpi_status,mpierr) - endif - enddo - - ! Insert qtmp into (output) q - - nc = 0 - - do i=1,na - - pc2 = p_col_out(i) - lc2 = l_col_out(i) - - if (pc2==my_pcol) then - nc = nc+1 - q(l_rqs:l_rqe,lc2) = qtmp(1:l_rows,nc) - endif - enddo - - deallocate(qtmp) - - end subroutine resort_ev - - subroutine transform_columns(col1, col2) - use precision - implicit none - - integer(kind=ik) :: col1, col2 - integer(kind=ik) :: pc1, pc2, lc1, lc2 - - if (l_rows==0) return ! My processor column has no work to do - - pc1 = p_col(col1) - lc1 = l_col(col1) - pc2 = p_col(col2) - lc2 = l_col(col2) - - if (pc1==my_pcol) then - if (pc2==my_pcol) then - ! 
both columns are local - tmp(1:l_rows) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + q(l_rqs:l_rqe,lc2)*qtrans(2,1) - q(l_rqs:l_rqe,lc2) = q(l_rqs:l_rqe,lc1)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) - q(l_rqs:l_rqe,lc1) = tmp(1:l_rows) - else - call mpi_sendrecv(q(l_rqs,lc1),l_rows,MPI_REAL8,pc2,1, & - tmp,l_rows,MPI_REAL8,pc2,1, & - mpi_comm_cols,mpi_status,mpierr) - q(l_rqs:l_rqe,lc1) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + tmp(1:l_rows)*qtrans(2,1) - endif - else if (pc2==my_pcol) then - call mpi_sendrecv(q(l_rqs,lc2),l_rows,MPI_REAL8,pc1,1, & - tmp,l_rows,MPI_REAL8,pc1,1, & - mpi_comm_cols,mpi_status,mpierr) - q(l_rqs:l_rqe,lc2) = tmp(1:l_rows)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) - endif - - end subroutine transform_columns - - subroutine global_gather(z, n) - - ! This routine sums up z over all processors. - ! It should only be used for gathering distributed results, - ! i.e. z(i) should be nonzero on exactly 1 processor column, - ! otherways the results may be numerically different on different columns - use precision - implicit none - - integer(kind=ik) :: n - real(kind=rk) :: z(n) - real(kind=rk) :: tmp(n) - - if (npc_n==1 .and. np_rows==1) return ! nothing to do - - ! Do an mpi_allreduce over processor rows - - call mpi_allreduce(z, tmp, n, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) - - ! If only 1 processor column, we are done - if (npc_n==1) then - z(:) = tmp(:) - return - endif - - ! If all processor columns are involved, we can use mpi_allreduce - if (npc_n==np_cols) then - call mpi_allreduce(tmp, z, n, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr) - return - endif - - ! Do a ring send over processor columns - z(:) = 0 - do np = 1, npc_n - z(:) = z(:) + tmp(:) - call MPI_Sendrecv_replace(z, n, MPI_REAL8, np_next, 1111, np_prev, 1111, & - mpi_comm_cols, mpi_status, mpierr) - enddo - - end subroutine global_gather - - subroutine global_product(z, n) - - ! This routine calculates the global product of z. 
- use precision - implicit none - - integer(kind=ik) :: n - real(kind=rk) :: z(n) - - real(kind=rk) :: tmp(n) - - if (npc_n==1 .and. np_rows==1) return ! nothing to do - - ! Do an mpi_allreduce over processor rows - - call mpi_allreduce(z, tmp, n, MPI_REAL8, MPI_PROD, mpi_comm_rows, mpierr) - - ! If only 1 processor column, we are done - if (npc_n==1) then - z(:) = tmp(:) - return - endif - - ! If all processor columns are involved, we can use mpi_allreduce - if (npc_n==np_cols) then - call mpi_allreduce(tmp, z, n, MPI_REAL8, MPI_PROD, mpi_comm_cols, mpierr) - return - endif - - ! We send all vectors to the first proc, do the product there - ! and redistribute the result. - - if (my_pcol == npc_0) then - z(1:n) = tmp(1:n) - do np = npc_0+1, npc_0+npc_n-1 - call mpi_recv(tmp,n,MPI_REAL8,np,1111,mpi_comm_cols,mpi_status,mpierr) - z(1:n) = z(1:n)*tmp(1:n) - enddo - do np = npc_0+1, npc_0+npc_n-1 - call mpi_send(z,n,MPI_REAL8,np,1111,mpi_comm_cols,mpierr) - enddo - else - call mpi_send(tmp,n,MPI_REAL8,npc_0,1111,mpi_comm_cols,mpierr) - call mpi_recv(z ,n,MPI_REAL8,npc_0,1111,mpi_comm_cols,mpi_status,mpierr) - endif - - end subroutine global_product - - subroutine check_monotony(n,d,text, wantDebug, success) - - ! This is a test routine for checking if the eigenvalues are monotonically increasing. - ! It is for debug purposes only, an error should never be triggered! - use precision - implicit none - - integer(kind=ik) :: n - real(kind=rk) :: d(n) - character*(*) :: text - - integer(kind=ik) :: i - logical, intent(in) :: wantDebug - logical, intent(out) :: success - - success = .true. - do i=1,n-1 - if (d(i+1)<d(i)) then - if (wantDebug) write(error_unit,'(a,a,i8,2g25.17)') 'ELPA1_check_monotony: Monotony error on ',text,i,d(i),d(i+1) - success = .false. 
- return - endif - enddo - end subroutine check_monotony - - end subroutine merge_systems - - subroutine v_add_s(v,n,s) - use precision - implicit none - integer(kind=ik) :: n - real(kind=rk) :: v(n),s - - v(:) = v(:) + s - end subroutine v_add_s - - subroutine distribute_global_column(g_col, l_col, noff, nlen, my_prow, np_rows, nblk) - use precision - implicit none - - real(kind=rk) :: g_col(nlen), l_col(*) ! chnage this to proper 2d 1d matching - integer(kind=ik) :: noff, nlen, my_prow, np_rows, nblk - - integer(kind=ik) :: nbs, nbe, jb, g_off, l_off, js, je - - nbs = noff/(nblk*np_rows) - nbe = (noff+nlen-1)/(nblk*np_rows) - - do jb = nbs, nbe - - g_off = jb*nblk*np_rows + nblk*my_prow - l_off = jb*nblk - - js = MAX(noff+1-g_off,1) - je = MIN(noff+nlen-g_off,nblk) - - if (je<js) cycle - - l_col(l_off+js:l_off+je) = g_col(g_off+js-noff:g_off+je-noff) - - enddo - - end subroutine distribute_global_column - - subroutine solve_secular_equation(n, i, d, z, delta, rho, dlam) - - !------------------------------------------------------------------------------- - ! This routine solves the secular equation of a symmetric rank 1 modified - ! diagonal matrix: - ! - ! 1. + rho*SUM(z(:)**2/(d(:)-x)) = 0 - ! - ! It does the same as the LAPACK routine DLAED4 but it uses a bisection technique - ! which is more robust (it always yields a solution) but also slower - ! than the algorithm used in DLAED4. - ! - ! The same restictions than in DLAED4 hold, namely: - ! - ! rho > 0 and d(i+1) > d(i) - ! - ! but this routine will not terminate with error if these are not satisfied - ! (it will normally converge to a pole in this case). - ! - ! The output in DELTA(j) is always (D(j) - lambda_I), even for the cases - ! N=1 and N=2 which is not compatible with DLAED4. - ! Thus this routine shouldn't be used for these cases as a simple replacement - ! of DLAED4. - ! - ! The arguments are the same as in DLAED4 (with the exception of the INFO argument): - ! - ! - ! N (input) INTEGER - ! 
The length of all arrays. - ! - ! I (input) INTEGER - ! The index of the eigenvalue to be computed. 1 <= I <= N. - ! - ! D (input) DOUBLE PRECISION array, dimension (N) - ! The original eigenvalues. It is assumed that they are in - ! order, D(I) < D(J) for I < J. - ! - ! Z (input) DOUBLE PRECISION array, dimension (N) - ! The components of the updating vector. - ! - ! DELTA (output) DOUBLE PRECISION array, dimension (N) - ! DELTA contains (D(j) - lambda_I) in its j-th component. - ! See remark above about DLAED4 compatibility! - ! - ! RHO (input) DOUBLE PRECISION - ! The scalar in the symmetric updating formula. - ! - ! DLAM (output) DOUBLE PRECISION - ! The computed lambda_I, the I-th updated eigenvalue. - !------------------------------------------------------------------------------- - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: n, i - real(kind=rk) :: d(n), z(n), delta(n), rho, dlam - - integer(kind=ik) :: iter - real(kind=rk) :: a, b, x, y, dshift - - ! In order to obtain sufficient numerical accuracy we have to shift the problem - ! either by d(i) or d(i+1), whichever is closer to the solution - - ! Upper and lower bound of the shifted solution interval are a and b - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_secular_equation") -#endif - if (i==n) then - - ! Special case: Last eigenvalue - ! We shift always by d(n), lower bound is d(n), - ! upper bound is determined by a guess: - - dshift = d(n) - delta(:) = d(:) - dshift - - a = 0. ! delta(n) - b = rho*SUM(z(:)**2) + 1. ! rho*SUM(z(:)**2) is the lower bound for the guess - - else - - ! Other eigenvalues: lower bound is d(i), upper bound is d(i+1) - ! We check the sign of the function in the midpoint of the interval - ! in order to determine if eigenvalue is more close to d(i) or d(i+1) - - x = 0.5*(d(i)+d(i+1)) - y = 1. + rho*SUM(z(:)**2/(d(:)-x)) - - if (y>0) then - ! solution is next to d(i) - dshift = d(i) - else - ! 
solution is next to d(i+1) - dshift = d(i+1) - endif - - delta(:) = d(:) - dshift - a = delta(i) - b = delta(i+1) - - endif - - ! Bisection: - - do iter=1,200 - - ! Interval subdivision - - x = 0.5*(a+b) - - if (x==a .or. x==b) exit ! No further interval subdivisions possible - if (abs(x) < 1.d-200) exit ! x next to pole - - ! evaluate value at x - - y = 1. + rho*SUM(z(:)**2/(delta(:)-x)) - - if (y==0) then - ! found exact solution - exit - elseif (y>0) then - b = x - else - a = x - endif - - enddo - - ! Solution: - - dlam = x + dshift - delta(:) = delta(:) - x -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_secular_equation") -#endif - - end subroutine solve_secular_equation - - !------------------------------------------------------------------------------- - - integer function local_index(idx, my_proc, num_procs, nblk, iflag) - - !------------------------------------------------------------------------------- - ! local_index: returns the local index for a given global index - ! If the global index has no local index on the - ! processor my_proc behaviour is defined by iflag - ! - ! Parameters - ! - ! idx Global index - ! - ! my_proc Processor row/column for which to calculate the local index - ! - ! num_procs Total number of processors along row/column - ! - ! nblk Blocksize - ! - ! iflag Controls the behaviour if idx is not on local processor - ! iflag< 0 : Return last local index before that row/col - ! iflag==0 : Return 0 - ! iflag> 0 : Return next local index after that row/col - !------------------------------------------------------------------------------- - use precision - implicit none - - integer(kind=ik) :: idx, my_proc, num_procs, nblk, iflag - - integer(kind=ik) :: iblk - - iblk = (idx-1)/nblk ! global block number, 0 based - - if (mod(iblk,num_procs) == my_proc) then - - ! block is local, always return local row/col number - - local_index = (iblk/num_procs)*nblk + mod(idx-1,nblk) + 1 - - else - - ! 
non local block - - if (iflag == 0) then - - local_index = 0 - - else - - local_index = (iblk/num_procs)*nblk - - if (mod(iblk,num_procs) > my_proc) local_index = local_index + nblk - - if (iflag>0) local_index = local_index + 1 - endif - endif - - end function local_index - - subroutine cholesky_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success) - - !------------------------------------------------------------------------------- - ! cholesky_real: Cholesky factorization of a real symmetric matrix - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be factorized. - ! Distribution is like in Scalapack. - ! Only upper triangle is needs to be set. - ! On return, the upper triangle contains the Cholesky factor - ! and the lower triangle is set to 0. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=rk) :: a(lda,matrixCols) - ! was - ! 
real a(lda, *) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx - integer(kind=ik) :: n, nc, i, info - integer(kind=ik) :: lcs, lce, lrs, lre - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - - real(kind=rk), allocatable :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:) - - logical, intent(in) :: wantDebug - logical, intent(out) :: success - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("cholesky_real") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - success = .true. - - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a - - allocate(tmp1(nblk*nblk)) - allocate(tmp2(nblk,nblk)) - tmp1 = 0 - tmp2 = 0 - - allocate(tmatr(l_rows,nblk)) - allocate(tmatc(l_cols,nblk)) - tmatr = 0 - tmatc = 0 - - do n = 1, na, nblk - - ! Calculate first local row and column of the still remaining matrix - ! on the local processor - - l_row1 = local_index(n, my_prow, np_rows, nblk, +1) - l_col1 = local_index(n, my_pcol, np_cols, nblk, +1) - - l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1) - l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1) - - if (n+nblk > na) then - - ! This is the last step, just do a Cholesky-Factorization - ! of the remaining block - - if (my_prow==prow(n, nblk, np_rows) .and. 
my_pcol==pcol(n, nblk, np_cols)) then - - call dpotrf('U',na-n+1,a(l_row1,l_col1),lda,info) - if (info/=0) then - if (wantDebug) write(error_unit,*) "ELPA1_cholesky_real: Error in dpotrf" - success = .false. - return - endif - - endif - - exit ! Loop - - endif - - if (my_prow==prow(n, nblk, np_rows)) then - - if (my_pcol==pcol(n, nblk, np_cols)) then - - ! The process owning the upper left remaining block does the - ! Cholesky-Factorization of this block - - call dpotrf('U',nblk,a(l_row1,l_col1),lda,info) - if (info/=0) then - if (wantDebug) write(error_unit,*) "ELPA1_cholesky_real: Error in dpotrf" - success = .false. - return - endif - - nc = 0 - do i=1,nblk - tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1) - nc = nc+i - enddo - endif - - call MPI_Bcast(tmp1,nblk*(nblk+1)/2,MPI_REAL8,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) - - nc = 0 - do i=1,nblk - tmp2(1:i,i) = tmp1(nc+1:nc+i) - nc = nc+i - enddo - - if (l_cols-l_colx+1>0) & - call dtrsm('L','U','T','N',nblk,l_cols-l_colx+1,1.d0,tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda) - - endif - - do i=1,nblk - - if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = a(l_row1+i-1,l_colx:l_cols) - if (l_cols-l_colx+1>0) & - call MPI_Bcast(tmatc(l_colx,i),l_cols-l_colx+1,MPI_REAL8,prow(n, nblk, np_rows),mpi_comm_rows,mpierr) - - enddo - ! this has to be checked since it was changed substantially when doing type safe - call elpa_transpose_vectors_real (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, & - tmatr, ubound(tmatr,dim=1), mpi_comm_rows, & - n, na, nblk, nblk) - - do i=0,(na-1)/tile_size - lcs = max(l_colx,i*l_cols_tile+1) - lce = min(l_cols,(i+1)*l_cols_tile) - lrs = l_rowx - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce<lcs .or. lre<lrs) cycle - call DGEMM('N','T',lre-lrs+1,lce-lcs+1,nblk,-1.d0, & - tmatr(lrs,1),ubound(tmatr,dim=1),tmatc(lcs,1),ubound(tmatc,dim=1), & - 1.d0,a(lrs,lcs),lda) - enddo - - enddo - - deallocate(tmp1, tmp2, tmatr, tmatc) - - ! 
Set the lower triangle to 0, it contains garbage (form the above matrix multiplications) - - do i=1,na - if (my_pcol==pcol(i, nblk, np_cols)) then - ! column i is on local processor - l_col1 = local_index(i , my_pcol, np_cols, nblk, +1) ! local column number - l_row1 = local_index(i+1, my_prow, np_rows, nblk, +1) ! first row below diagonal - a(l_row1:l_rows,l_col1) = 0 - endif - enddo -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("cholesky_real") -#endif - - end subroutine cholesky_real - - subroutine invert_trm_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success) - - !------------------------------------------------------------------------------- - ! invert_trm_real: Inverts a upper triangular matrix - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be inverted. - ! Distribution is like in Scalapack. - ! Only upper triangle is needs to be set. - ! The lower triangle is not referenced. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! 
- !------------------------------------------------------------------------------- - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: a(lda,*) -#else - real(kind=rk) :: a(lda,matrixCols) -#endif - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx - integer(kind=ik) :: n, nc, i, info, ns, nb - - real(kind=rk), allocatable :: tmp1(:), tmp2(:,:), tmat1(:,:), tmat2(:,:) - - logical, intent(in) :: wantDebug - logical, intent(out) :: success - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - success = .true. - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a - - allocate(tmp1(nblk*nblk)) - allocate(tmp2(nblk,nblk)) - tmp1 = 0 - tmp2 = 0 - - allocate(tmat1(l_rows,nblk)) - allocate(tmat2(nblk,l_cols)) - tmat1 = 0 - tmat2 = 0 - - - ns = ((na-1)/nblk)*nblk + 1 - - do n = ns,1,-nblk - - l_row1 = local_index(n, my_prow, np_rows, nblk, +1) - l_col1 = local_index(n, my_pcol, np_cols, nblk, +1) - - nb = nblk - if (na-n+1 < nblk) nb = na-n+1 - - l_rowx = local_index(n+nb, my_prow, np_rows, nblk, +1) - l_colx = local_index(n+nb, my_pcol, np_cols, nblk, +1) - - if (my_prow==prow(n, nblk, np_rows)) then - - if (my_pcol==pcol(n, nblk, np_cols)) then - - call DTRTRI('U','N',nb,a(l_row1,l_col1),lda,info) - if (info/=0) then - if (wantDebug) write(error_unit,*) "ELPA1_invert_trm_real: Error in DTRTRI" - success = .false. 
- return - endif - - nc = 0 - do i=1,nb - tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1) - nc = nc+i - enddo - endif - - call MPI_Bcast(tmp1,nb*(nb+1)/2,MPI_REAL8,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) - - nc = 0 - do i=1,nb - tmp2(1:i,i) = tmp1(nc+1:nc+i) - nc = nc+i - enddo - - if (l_cols-l_colx+1>0) & - call DTRMM('L','U','N','N',nb,l_cols-l_colx+1,1.d0,tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda) - - if (l_colx<=l_cols) tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols) - if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0 - - endif - - if (l_row1>1) then - if (my_pcol==pcol(n, nblk, np_cols)) then - tmat1(1:l_row1-1,1:nb) = a(1:l_row1-1,l_col1:l_col1+nb-1) - a(1:l_row1-1,l_col1:l_col1+nb-1) = 0 - endif - - do i=1,nb - call MPI_Bcast(tmat1(1,i),l_row1-1,MPI_REAL8,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) - enddo - endif - - if (l_cols-l_col1+1>0) & - call MPI_Bcast(tmat2(1,l_col1),(l_cols-l_col1+1)*nblk,MPI_REAL8,prow(n, nblk, np_rows),mpi_comm_rows,mpierr) - - if (l_row1>1 .and. l_cols-l_col1+1>0) & - call dgemm('N','N',l_row1-1,l_cols-l_col1+1,nb, -1.d0, & - tmat1,ubound(tmat1,dim=1),tmat2(1,l_col1),ubound(tmat2,dim=1), & - 1.d0, a(1,l_col1),lda) - - enddo - - deallocate(tmp1, tmp2, tmat1, tmat2) - - end subroutine invert_trm_real - - subroutine cholesky_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success) - - !------------------------------------------------------------------------------- - ! cholesky_complex: Cholesky factorization of a complex hermitian matrix - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matriCols) Distributed matrix which should be factorized. - ! Distribution is like in Scalapack. - ! Only upper triangle is needs to be set. - ! On return, the upper triangle contains the Cholesky factor - ! and the lower triangle is set to 0. - ! - ! lda Leading dimension of a - ! 
matrixCols local columns of matrix a - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck) :: a(lda,*) -#else - complex(kind=ck) :: a(lda,matrixCols) -#endif - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx - integer(kind=ik) :: n, nc, i, info - integer(kind=ik) :: lcs, lce, lrs, lre - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - - complex(kind=ck), allocatable :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:) - - logical, intent(in) :: wantDebug - logical, intent(out) :: success -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("cholesky_complex") -#endif - success = .true. - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! 
Local cols of a - - allocate(tmp1(nblk*nblk)) - allocate(tmp2(nblk,nblk)) - tmp1 = 0 - tmp2 = 0 - - allocate(tmatr(l_rows,nblk)) - allocate(tmatc(l_cols,nblk)) - tmatr = 0 - tmatc = 0 - - do n = 1, na, nblk - - ! Calculate first local row and column of the still remaining matrix - ! on the local processor - - l_row1 = local_index(n, my_prow, np_rows, nblk, +1) - l_col1 = local_index(n, my_pcol, np_cols, nblk, +1) - - l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1) - l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1) - - if (n+nblk > na) then - - ! This is the last step, just do a Cholesky-Factorization - ! of the remaining block - - if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then - - call zpotrf('U',na-n+1,a(l_row1,l_col1),lda,info) - if (info/=0) then - if (wantDebug) write(error_unit,*) "ELPA1_cholesky_complex: Error in zpotrf" - success = .false. - return - endif - - endif - - exit ! Loop - endif - - if (my_prow==prow(n, nblk, np_rows)) then - - if (my_pcol==pcol(n, nblk, np_cols)) then - - ! The process owning the upper left remaining block does the - ! Cholesky-Factorization of this block - - call zpotrf('U',nblk,a(l_row1,l_col1),lda,info) - if (info/=0) then - if (wantDebug) write(error_unit,*) "ELPA1_cholesky_complex: Error in zpotrf" - success = .false. 
- return - endif - - nc = 0 - do i=1,nblk - tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1) - nc = nc+i - enddo - endif - - call MPI_Bcast(tmp1,nblk*(nblk+1)/2,MPI_DOUBLE_COMPLEX,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) - - nc = 0 - do i=1,nblk - tmp2(1:i,i) = tmp1(nc+1:nc+i) - nc = nc+i - enddo - - if (l_cols-l_colx+1>0) & - call ztrsm('L','U','C','N',nblk,l_cols-l_colx+1,(1.d0,0.d0),tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda) - - endif - - do i=1,nblk - - if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = conjg(a(l_row1+i-1,l_colx:l_cols)) - if (l_cols-l_colx+1>0) & - call MPI_Bcast(tmatc(l_colx,i),l_cols-l_colx+1,MPI_DOUBLE_COMPLEX,prow(n, nblk, np_rows),mpi_comm_rows,mpierr) - - enddo - ! this has to be checked since it was changed substantially when doing type safe - call elpa_transpose_vectors_complex (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, & - tmatr, ubound(tmatr,dim=1), mpi_comm_rows, & - n, na, nblk, nblk) - do i=0,(na-1)/tile_size - lcs = max(l_colx,i*l_cols_tile+1) - lce = min(l_cols,(i+1)*l_cols_tile) - lrs = l_rowx - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce<lcs .or. lre<lrs) cycle - call ZGEMM('N','C',lre-lrs+1,lce-lcs+1,nblk,(-1.d0,0.d0), & - tmatr(lrs,1),ubound(tmatr,dim=1),tmatc(lcs,1),ubound(tmatc,dim=1), & - (1.d0,0.d0),a(lrs,lcs),lda) - enddo - - enddo - - deallocate(tmp1, tmp2, tmatr, tmatc) - - ! Set the lower triangle to 0, it contains garbage (form the above matrix multiplications) - - do i=1,na - if (my_pcol==pcol(i, nblk, np_cols)) then - ! column i is on local processor - l_col1 = local_index(i , my_pcol, np_cols, nblk, +1) ! local column number - l_row1 = local_index(i+1, my_prow, np_rows, nblk, +1) ! 
first row below diagonal - a(l_row1:l_rows,l_col1) = 0 - endif - enddo -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("cholesky_complex") -#endif - - end subroutine cholesky_complex - - subroutine invert_trm_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success) - - !------------------------------------------------------------------------------- - ! invert_trm_complex: Inverts a upper triangular matrix - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be inverted. - ! Distribution is like in Scalapack. - ! Only upper triangle is needs to be set. - ! The lower triangle is not referenced. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - !------------------------------------------------------------------------------- - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck) :: a(lda,*) -#else - complex(kind=ck) :: a(lda,matrixCols) -#endif - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx - integer(kind=ik) :: n, nc, i, info, ns, nb - - complex(kind=ck), allocatable :: tmp1(:), tmp2(:,:), tmat1(:,:), tmat2(:,:) - - logical, intent(in) :: wantDebug - logical, intent(out) :: success - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - success = .true. - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! 
Local cols of a - - allocate(tmp1(nblk*nblk)) - allocate(tmp2(nblk,nblk)) - tmp1 = 0 - tmp2 = 0 - - allocate(tmat1(l_rows,nblk)) - allocate(tmat2(nblk,l_cols)) - tmat1 = 0 - tmat2 = 0 - - ns = ((na-1)/nblk)*nblk + 1 - - do n = ns,1,-nblk - - l_row1 = local_index(n, my_prow, np_rows, nblk, +1) - l_col1 = local_index(n, my_pcol, np_cols, nblk, +1) - - nb = nblk - if (na-n+1 < nblk) nb = na-n+1 - - l_rowx = local_index(n+nb, my_prow, np_rows, nblk, +1) - l_colx = local_index(n+nb, my_pcol, np_cols, nblk, +1) - - if (my_prow==prow(n, nblk, np_rows)) then - - if (my_pcol==pcol(n, nblk, np_cols)) then - - call ZTRTRI('U','N',nb,a(l_row1,l_col1),lda,info) - if (info/=0) then - if (wantDebug) write(error_unit,*) "ELPA1_invert_trm_complex: Error in ZTRTRI" - success = .false. - return - endif - - nc = 0 - do i=1,nb - tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1) - nc = nc+i - enddo - endif - - call MPI_Bcast(tmp1,nb*(nb+1)/2,MPI_DOUBLE_COMPLEX,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) - - nc = 0 - do i=1,nb - tmp2(1:i,i) = tmp1(nc+1:nc+i) - nc = nc+i - enddo - - if (l_cols-l_colx+1>0) & - call ZTRMM('L','U','N','N',nb,l_cols-l_colx+1,(1.d0,0.d0),tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda) - - if (l_colx<=l_cols) tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols) - if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0 - - endif - - if (l_row1>1) then - if (my_pcol==pcol(n, nblk, np_cols)) then - tmat1(1:l_row1-1,1:nb) = a(1:l_row1-1,l_col1:l_col1+nb-1) - a(1:l_row1-1,l_col1:l_col1+nb-1) = 0 - endif - - do i=1,nb - call MPI_Bcast(tmat1(1,i),l_row1-1,MPI_DOUBLE_COMPLEX,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) - enddo - endif - - if (l_cols-l_col1+1>0) & - call MPI_Bcast(tmat2(1,l_col1),(l_cols-l_col1+1)*nblk,MPI_DOUBLE_COMPLEX,prow(n, nblk, np_rows),mpi_comm_rows,mpierr) - - if (l_row1>1 .and. 
l_cols-l_col1+1>0) & - call ZGEMM('N','N',l_row1-1,l_cols-l_col1+1,nb, (-1.d0,0.d0), & - tmat1,ubound(tmat1,dim=1),tmat2(1,l_col1),ubound(tmat2,dim=1), & - (1.d0,0.d0), a(1,l_col1),lda) - - enddo - - deallocate(tmp1, tmp2, tmat1, tmat2) - - end subroutine invert_trm_complex - - integer function least_common_multiple(a, b) - - ! Returns the least common multiple of a and b - ! There may be more efficient ways to do this, we use the most simple approach - use precision - implicit none - integer(kind=ik), intent(in) :: a, b - - do least_common_multiple = a, a*(b-1), a - if(mod(least_common_multiple,b)==0) exit - enddo - ! if the loop is left regularly, least_common_multiple = a*b - - end function - - subroutine hh_transform_real(alpha, xnorm_sq, xf, tau) - ! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:) - ! and returns the factor xf by which x has to be scaled. - ! It also hasn't the special handling for numbers < 1.d-300 or > 1.d150 - ! since this would be expensive for the parallel implementation. - use precision - implicit none - real(kind=rk), intent(inout) :: alpha - real(kind=rk), intent(in) :: xnorm_sq - real(kind=rk), intent(out) :: xf, tau - - real(kind=rk) :: BETA - - if ( XNORM_SQ==0. ) then - - if ( ALPHA>=0. ) then - TAU = 0. - else - TAU = 2. - ALPHA = -ALPHA - endif - XF = 0. - - else - - BETA = SIGN( SQRT( ALPHA**2 + XNORM_SQ ), ALPHA ) - ALPHA = ALPHA + BETA - IF ( BETA<0 ) THEN - BETA = -BETA - TAU = -ALPHA / BETA - ELSE - ALPHA = XNORM_SQ / ALPHA - TAU = ALPHA / BETA - ALPHA = -ALPHA - END IF - XF = 1./ALPHA - ALPHA = BETA - endif - - end subroutine - - subroutine hh_transform_complex(alpha, xnorm_sq, xf, tau) - - ! Similar to LAPACK routine ZLARFP, but uses ||x||**2 instead of x(:) - ! and returns the factor xf by which x has to be scaled. - ! It also hasn't the special handling for numbers < 1.d-300 or > 1.d150 - ! since this would be expensive for the parallel implementation. 
- use precision - implicit none - complex(kind=ck), intent(inout) :: alpha - real(kind=rk), intent(in) :: xnorm_sq - complex(kind=ck), intent(out) :: xf, tau - - real*8 ALPHR, ALPHI, BETA - - ALPHR = DBLE( ALPHA ) - ALPHI = DIMAG( ALPHA ) - - if ( XNORM_SQ==0. .AND. ALPHI==0. ) then - - if ( ALPHR>=0. ) then - TAU = 0. - else - TAU = 2. - ALPHA = -ALPHA - endif - XF = 0. - - else - - BETA = SIGN( SQRT( ALPHR**2 + ALPHI**2 + XNORM_SQ ), ALPHR ) - ALPHA = ALPHA + BETA - IF ( BETA<0 ) THEN - BETA = -BETA - TAU = -ALPHA / BETA - ELSE - ALPHR = ALPHI * (ALPHI/DBLE( ALPHA )) - ALPHR = ALPHR + XNORM_SQ/DBLE( ALPHA ) - TAU = DCMPLX( ALPHR/BETA, -ALPHI/BETA ) - ALPHA = DCMPLX( -ALPHR, ALPHI ) - END IF - XF = 1./ALPHA - ALPHA = BETA - endif - - end subroutine - -end module ELPA1_compute - diff --git a/src/elpa2.F90 b/src/elpa2.F90 deleted file mode 100644 index 9ee82e7be..000000000 --- a/src/elpa2.F90 +++ /dev/null @@ -1,540 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), fomerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! This particular source code file contains additions, changes and -! enhancements authored by Intel Corporation which is not part of -! the ELPA consortium. -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! 
GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - - - -! ELPA2 -- 2-stage solver for ELPA -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - - -#include "config-f90.h" -!> \brief Fortran module which provides the routines to use the two-stage ELPA solver -module ELPA2 - -! Version 1.1.2, 2011-02-21 - - use elpa_utilities - use elpa1_compute - use elpa1, only : elpa_print_times, time_evp_back, time_evp_fwd, time_evp_solve - use elpa2_utilities - use elpa2_compute - use elpa_pdgeqrf - - implicit none - - PRIVATE ! 
By default, all routines contained are private - - ! The following routines are public: - - public :: solve_evp_real_2stage - public :: solve_evp_complex_2stage - - include 'mpif.h' - -!****** -contains -!------------------------------------------------------------------------------- -!> \brief solve_evp_real_2stage: Fortran function to solve the real eigenvalue problem with a 2 stage approach -!> -!> Parameters -!> -!> \param na Order of matrix a -!> -!> \param nev Number of eigenvalues needed -!> -!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. -!> Distribution is like in Scalapack. -!> The full matrix must be set (not only one half like in scalapack). -!> Destroyed on exit (upper and lower half). -!> -!> \param lda Leading dimension of a -!> -!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set -!> -!> \param q(ldq,matrixCols) On output: Eigenvectors of a -!> Distribution is like in Scalapack. -!> Must be always dimensioned to the full size (corresponding to (na,na)) -!> even if only a part of the eigenvalues is needed. -!> -!> \param ldq Leading dimension of q -!> -!> \param nblk blocksize of cyclic distribution, must be the same in both directions! 
-!> -!> \param matrixCols local columns of matrix a and q -!> -!> \param mpi_comm_rows MPI communicator for rows -!> \param mpi_comm_cols MPI communicator for columns -!> \param mpi_comm_all MPI communicator for the total processor set -!> -!> \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API -!> -!> \param use_qr (optional) use QR decomposition -!> -!> \result success logical, false if error occured -!------------------------------------------------------------------------------- - -function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, & - mpi_comm_rows, mpi_comm_cols, & - mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,& - useQR) result(success) - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - logical, intent(in), optional :: useQR - logical :: useQRActual, useQREnvironment - integer(kind=ik), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API - integer(kind=ik) :: THIS_REAL_ELPA_KERNEL - - integer(kind=ik), intent(in) :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, & - mpi_comm_cols, mpi_comm_all - integer(kind=ik), intent(in) :: nblk - real(kind=rk), intent(inout) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols) - ! was - ! real a(lda,*), q(ldq,*) - real(kind=rk), allocatable :: hh_trans_real(:,:) - - integer(kind=ik) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: nbw, num_blocks - real(kind=rk), allocatable :: tmat(:,:,:), e(:) - real(kind=rk) :: ttt0, ttt1, ttts - integer(kind=ik) :: i - logical :: success - logical, save :: firstCall = .true. 
- logical :: wantDebug - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_evp_real_2stage") -#endif - call mpi_comm_rank(mpi_comm_all,my_pe,mpierr) - call mpi_comm_size(mpi_comm_all,n_pes,mpierr) - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - - wantDebug = .false. - if (firstCall) then - ! are debug messages desired? - wantDebug = debug_messages_via_environment_variable() - firstCall = .false. - endif - - success = .true. - - useQRActual = .false. - - ! set usage of qr decomposition via API call - if (present(useQR)) then - if (useQR) useQRActual = .true. - if (.not.(useQR)) useQRACtual = .false. - endif - - ! overwrite this with environment variable settings - if (qr_decomposition_via_environment_variable(useQREnvironment)) then - useQRActual = useQREnvironment - endif - - if (useQRActual) then - if (mod(na,nblk) .ne. 0) then - if (wantDebug) then - write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize" - endif - print *, "Do not use QR-decomposition for this matrix and blocksize." - success = .false. - return - endif - endif - - - if (present(THIS_REAL_ELPA_KERNEL_API)) then - ! user defined kernel via the optional argument in the API call - THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API - else - - ! if kernel is not choosen via api - ! check whether set by environment variable - THIS_REAL_ELPA_KERNEL = get_actual_real_kernel() - endif - - ! check whether choosen kernel is allowed - if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then - - if (my_pe == 0) then - write(error_unit,*) " " - write(error_unit,*) "The choosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL) - write(error_unit,*) "is not in the list of the allowed kernels!" 
- write(error_unit,*) " " - write(error_unit,*) "Allowed kernels are:" - do i=1,size(REAL_ELPA_KERNEL_NAMES(:)) - if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then - write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i) - endif - enddo - - write(error_unit,*) " " - write(error_unit,*) "The defaul kernel REAL_ELPA_KERNEL_GENERIC will be used !" - endif - THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC - - endif - - ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32 - ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal. - ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32! - ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye - ! on this and maybe allow a run-time optimization here - nbw = (63/nblk+1)*nblk - - num_blocks = (na-1)/nbw + 1 - - allocate(tmat(nbw,nbw,num_blocks)) - - ! Reduction full -> band - - ttt0 = MPI_Wtime() - ttts = ttt0 - call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, & - tmat, wantDebug, success, useQRActual) - if (.not.(success)) return - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time bandred_real :',ttt1-ttt0 - - ! Reduction band -> tridiagonal - - allocate(e(na)) - - ttt0 = MPI_Wtime() - call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, & - mpi_comm_rows, mpi_comm_cols, mpi_comm_all) - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time tridiag_band_real :',ttt1-ttt0 - - call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr) - call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr) - - ttt1 = MPI_Wtime() - time_evp_fwd = ttt1-ttts - - ! Solve tridiagonal system - - ttt0 = MPI_Wtime() - call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, & - mpi_comm_cols, wantDebug, success) - if (.not.(success)) return - - ttt1 = MPI_Wtime() - if (my_prow==0 .and. 
my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0 - time_evp_solve = ttt1-ttt0 - ttts = ttt1 - - deallocate(e) - - ! Backtransform stage 1 - - ttt0 = MPI_Wtime() - call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, & - mpi_comm_rows, mpi_comm_cols, wantDebug, success, & - THIS_REAL_ELPA_KERNEL) - if (.not.(success)) return - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0 - - ! We can now deallocate the stored householder vectors - deallocate(hh_trans_real) - - ! Backtransform stage 2 - - ttt0 = MPI_Wtime() - call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, & - mpi_comm_cols, useQRActual) - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0 - time_evp_back = ttt1-ttts - - deallocate(tmat) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_evp_real_2stage") -#endif -1 format(a,f10.3) - -end function solve_evp_real_2stage - - -!------------------------------------------------------------------------------- -!> \brief solve_evp_complex_2stage: Fortran function to solve the complex eigenvalue problem with a 2 stage approach -!> -!> Parameters -!> -!> \param na Order of matrix a -!> -!> \param nev Number of eigenvalues needed -!> -!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. -!> Distribution is like in Scalapack. -!> The full matrix must be set (not only one half like in scalapack). -!> Destroyed on exit (upper and lower half). -!> -!> \param lda Leading dimension of a -!> -!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set -!> -!> \param q(ldq,matrixCols) On output: Eigenvectors of a -!> Distribution is like in Scalapack. 
-!> Must be always dimensioned to the full size (corresponding to (na,na)) -!> even if only a part of the eigenvalues is needed. -!> -!> \param ldq Leading dimension of q -!> -!> \param nblk blocksize of cyclic distribution, must be the same in both directions! -!> -!> \param matrixCols local columns of matrix a and q -!> -!> \param mpi_comm_rows MPI communicator for rows -!> \param mpi_comm_cols MPI communicator for columns -!> \param mpi_comm_all MPI communicator for the total processor set -!> -!> \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API -!> -!> \result success logical, false if error occured -!------------------------------------------------------------------------------- -function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_cols, & - mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success) - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API - integer(kind=ik) :: THIS_COMPLEX_ELPA_KERNEL - integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all - complex(kind=ck), intent(inout) :: a(lda,matrixCols), q(ldq,matrixCols) - ! was - ! complex a(lda,*), q(ldq,*) - real(kind=rk), intent(inout) :: ev(na) - complex(kind=ck), allocatable :: hh_trans_complex(:,:) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes - integer(kind=ik) :: l_cols, l_rows, l_cols_nev, nbw, num_blocks - complex(kind=ck), allocatable :: tmat(:,:,:) - real(kind=rk), allocatable :: q_real(:,:), e(:) - real(kind=rk) :: ttt0, ttt1, ttts - integer(kind=ik) :: i - - logical :: success, wantDebug - logical, save :: firstCall = .true. 
- -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_evp_complex_2stage") -#endif - call mpi_comm_rank(mpi_comm_all,my_pe,mpierr) - call mpi_comm_size(mpi_comm_all,n_pes,mpierr) - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - wantDebug = .false. - if (firstCall) then - ! are debug messages desired? - wantDebug = debug_messages_via_environment_variable() - firstCall = .false. - endif - - - success = .true. - - if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then - ! user defined kernel via the optional argument in the API call - THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API - else - ! if kernel is not choosen via api - ! check whether set by environment variable - THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel() - endif - - ! check whether choosen kernel is allowed - if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then - - if (my_pe == 0) then - write(error_unit,*) " " - write(error_unit,*) "The choosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL) - write(error_unit,*) "is not in the list of the allowed kernels!" - write(error_unit,*) " " - write(error_unit,*) "Allowed kernels are:" - do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:)) - if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then - write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i) - endif - enddo - - write(error_unit,*) " " - write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !" - endif - THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC - endif - ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32 - - nbw = (31/nblk+1)*nblk - - num_blocks = (na-1)/nbw + 1 - - allocate(tmat(nbw,nbw,num_blocks)) - - ! 
Reduction full -> band - - ttt0 = MPI_Wtime() - ttts = ttt0 - call bandred_complex(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, & - tmat, wantDebug, success) - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop() -#endif - return - endif - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time bandred_complex :',ttt1-ttt0 - - ! Reduction band -> tridiagonal - - allocate(e(na)) - - ttt0 = MPI_Wtime() - call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, & - mpi_comm_rows, mpi_comm_cols, mpi_comm_all) - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time tridiag_band_complex :',ttt1-ttt0 - - call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr) - call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr) - - ttt1 = MPI_Wtime() - time_evp_fwd = ttt1-ttts - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q - l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev - - allocate(q_real(l_rows,l_cols)) - - ! Solve tridiagonal system - - ttt0 = MPI_Wtime() - call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, & - mpi_comm_rows, mpi_comm_cols, wantDebug, success) - if (.not.(success)) return - - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0 - time_evp_solve = ttt1-ttt0 - ttts = ttt1 - - q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev) - - deallocate(e, q_real) - - ! 
Backtransform stage 1 - - ttt0 = MPI_Wtime() - call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, & - matrixCols, hh_trans_complex, & - mpi_comm_rows, mpi_comm_cols, & - wantDebug, success,THIS_COMPLEX_ELPA_KERNEL) - if (.not.(success)) return - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0 - - ! We can now deallocate the stored householder vectors - deallocate(hh_trans_complex) - - ! Backtransform stage 2 - - ttt0 = MPI_Wtime() - call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, & - mpi_comm_rows, mpi_comm_cols) - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0 - time_evp_back = ttt1-ttts - - deallocate(tmat) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_evp_complex_2stage") -#endif - -1 format(a,f10.3) - -end function solve_evp_complex_2stage - -end module ELPA2 diff --git a/src/elpa2_compute.F90 b/src/elpa2_compute.F90 deleted file mode 100644 index 9dde5dc8d..000000000 --- a/src/elpa2_compute.F90 +++ /dev/null @@ -1,5303 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), fomerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! This particular source code file contains additions, changes and -! 
enhancements authored by Intel Corporation which is not part of -! the ELPA consortium. -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - - - -! ELPA2 -- 2-stage solver for ELPA -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - - -#include "config-f90.h" - -module ELPA2_compute - -! 
Version 1.1.2, 2011-02-21 - - use elpa_utilities - USE ELPA1_compute - use elpa1, only : elpa_print_times, time_evp_back, time_evp_fwd, time_evp_solve - use elpa2_utilities - use elpa_pdgeqrf - - implicit none - - PRIVATE ! By default, all routines contained are private - - public :: bandred_real - public :: tridiag_band_real - public :: trans_ev_tridi_to_band_real - public :: trans_ev_band_to_full_real - - public :: bandred_complex - public :: tridiag_band_complex - public :: trans_ev_tridi_to_band_complex - public :: trans_ev_band_to_full_complex - - public :: band_band_real - public :: divide_band - - integer, public :: which_qr_decomposition = 1 ! defines, which QR-decomposition algorithm will be used - ! 0 for unblocked - ! 1 for blocked (maxrank: nblk) - include 'mpif.h' - - contains - - subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, & - tmat, wantDebug, success, useQR) - - !------------------------------------------------------------------------------- - ! bandred_real: Reduces a distributed symmetric matrix to band form - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be reduced. - ! Distribution is like in Scalapack. - ! Opposed to Scalapack, a(:,:) must be set completely (upper and lower half) - ! a(:,:) is overwritten on exit with the band and the Householder vectors - ! in the upper half. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! nbw semi bandwith of output matrix - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1 - ! Factors for the Householder vectors (returned), needed for back transformation - ! 
- !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif -#ifdef WITH_OPENMP - use omp_lib -#endif - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: a(lda,*), tmat(nbw,nbw,*) -#else - real(kind=rk) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks) -#endif - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, vmrCols - integer(kind=ik) :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow - integer(kind=ik) :: istep, ncol, lch, lcx, nlc, mynlc - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - - real(kind=rk) :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw) - - real(kind=rk), allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:) - - ! needed for blocked QR decomposition - integer(kind=ik) :: PQRPARAM(11), work_size - real(kind=rk) :: dwork_size(1) - real(kind=rk), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:) - - logical, intent(in) :: wantDebug - logical, intent(out) :: success - - logical, intent(in) :: useQR - - integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, ii, pp, transformChunkSize - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("bandred_real") -#endif - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - success = .true. - - - ! Semibandwith nbw must be a multiple of blocksize nblk - if (mod(nbw,nblk)/=0) then - if (my_prow==0 .and. my_pcol==0) then - if (wantDebug) then - write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk - write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk' - endif - success = .false. 
- return - endif - endif - - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile - - if (useQR) then - if (which_qr_decomposition == 1) then - call qr_pqrparam_init(pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s') - allocate(tauvector(na)) - allocate(blockheuristic(nblk)) - l_rows = local_index(na, my_prow, np_rows, nblk, -1) - allocate(vmr(max(l_rows,1),na)) - - vmrCols = na -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdgeqrf_2dcomm(a, lda, matrixCols, vmr, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), & - nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), & - mpi_comm_rows, mpi_comm_cols, blockheuristic) - -#else - call qr_pdgeqrf_2dcomm(a(1:lda,1:matrixCols), matrixCols, lda, vmr(1:max(l_rows,1),1:vmrCols), max(l_rows,1), & - vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, & - nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), & - mpi_comm_rows, mpi_comm_cols, blockheuristic) -#endif - work_size = dwork_size(1) - allocate(work_blocked(work_size)) - - work_blocked = 0.0d0 - deallocate(vmr) - endif - endif - - do istep = (na-1)/nbw, 1, -1 - - n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step - - ! Number of local columns/rows of remaining matrix - l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1) - l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1) - - ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces - - allocate(vmr(max(l_rows,1),2*n_cols)) - allocate(umc(max(l_cols,1),2*n_cols)) - - allocate(vr(l_rows+1)) - - vmr(1:l_rows,1:n_cols) = 0. 
- vr(:) = 0 - tmat(:,:,istep) = 0 - - ! Reduce current block to lower triangular form - - if (useQR) then - if (which_qr_decomposition == 1) then - - vmrCols = 2*n_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdgeqrf_2dcomm(a, lda, matrixCols, vmr, max(l_rows,1), vmrCols, tauvector(1), & - na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size, & - work_size, na, n_cols, nblk, nblk, & - istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,& - 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,& - blockheuristic) - -#else - call qr_pdgeqrf_2dcomm(a(1:lda,1:matrixCols), lda, matrixCols, vmr(1:max(l_rows,1),1:vmrCols) , & - max(l_rows,1), vmrCols, tauvector(1:na), na, & - tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, & - work_size, na, n_cols, nblk, nblk, & - istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,& - 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,& - blockheuristic) -#endif - endif - else - - do lc = n_cols, 1, -1 - - ncol = istep*nbw + lc ! absolute column number of householder vector - nrow = ncol - nbw ! Absolute number of pivot row - - lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length - lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number - - tau = 0 - - if (nrow == 1) exit ! Nothing to do - - cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block - - if (my_pcol==cur_pcol) then - - ! Get vector to be transformed; distribute last element and norm of - ! remaining elements to all procs in current column - - vr(1:lr) = a(1:lr,lch) ! vector to be transformed - - if (my_prow==prow(nrow, nblk, np_rows)) then - aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1)) - aux1(2) = vr(lr) - else - aux1(1) = dot_product(vr(1:lr),vr(1:lr)) - aux1(2) = 0. - endif - - call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - - vnorm2 = aux2(1) - vrl = aux2(2) - - ! Householder transformation - - call hh_transform_real(vrl, vnorm2, xf, tau) - - ! 
Scale vr and store Householder vector for back transformation - - vr(1:lr) = vr(1:lr) * xf - if (my_prow==prow(nrow, nblk, np_rows)) then - a(1:lr-1,lch) = vr(1:lr-1) - a(lr,lch) = vrl - vr(lr) = 1. - else - a(1:lr,lch) = vr(1:lr) - endif - - endif - - ! Broadcast Householder vector and tau along columns - - vr(lr+1) = tau - call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr) - vmr(1:lr,lc) = vr(1:lr) - tau = vr(lr+1) - tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat - - ! Transform remaining columns in current block with Householder vector - ! Local dot product - - aux1 = 0 -#ifdef WITH_OPENMP - !Open up one omp region to avoid paying openmp overhead. - !This does not help performance due to the addition of two openmp barriers around the MPI call, - !But in the future this may be beneficial if these barriers are replaced with a faster implementation - - !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1) - mynlc = 0 ! number of local columns - - !This loop does not have independent iterations, - !'mynlc' is incremented each iteration, and it is difficult to remove this dependency - !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration - !That is, a thread only executes the work associated with an iteration if its thread id is congruent to - !the iteration number modulo the number of threads - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0 ) then - mynlc = mynlc+1 - if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then - if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx)) - endif - endif - enddo - - ! Get global dot products - !$omp barrier - !$omp single - if (mynlc>0) call mpi_allreduce(aux1,aux2,mynlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - !$omp end single - !$omp barrier - - ! 
Transform - transformChunkSize=32 - mynlc = 0 - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - mynlc = mynlc+1 - !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32 - !However, for some reason this is slower than doing it manually, so it is parallelized as below. - do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize - do pp = 1,transformChunkSize - if (pp + ii > lr) exit - a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp) - enddo - enddo - endif - enddo - !$omp end parallel -#else - nlc = 0 ! number of local columns - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 - if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx)) - endif - enddo - - ! Get global dot products - if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - - ! Transform - - nlc = 0 - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 - a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr) - endif - enddo -#endif - enddo - - ! Calculate scalar products of stored Householder vectors. - ! This can be done in different ways, we use dsyrk - - vav = 0 - if (l_rows>0) & - call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,ubound(vmr,dim=1),0.d0,vav,ubound(vav,dim=1)) - call symm_matrix_allreduce(n_cols,vav, nbw, nbw,mpi_comm_rows) - - ! Calculate triangular matrix T for block Householder Transformation - - do lc=n_cols,1,-1 - tau = tmat(lc,lc,istep) - if (lc<n_cols) then - call dtrmv('U','T','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,dim=1),vav(lc+1,lc),1) - tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc) - endif - enddo - endif - - ! 
Transpose vmr -> vmc (stored in umc, second half) - - call elpa_transpose_vectors_real (vmr, ubound(vmr,dim=1), mpi_comm_rows, & - umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, & - 1, istep*nbw, n_cols, nblk) - - ! Calculate umc = A**T * vmr - ! Note that the distributed A has to be transposed - ! Opposed to direct tridiagonalization there is no need to use the cache locality - ! of the tiles, so we can use strips of the matrix - !Code for Algorithm 4 - - n_way = 1 -#ifdef WITH_OPENMP - n_way = omp_get_max_threads() -#endif - !umc(1:l_cols,1:n_cols) = 0.d0 - !vmr(1:l_rows,n_cols+1:2*n_cols) = 0 -#ifdef WITH_OPENMP - !$omp parallel private( i,lcs,lce,lrs,lre) -#endif - if (n_way > 1) then - !$omp do - do i=1,min(l_cols_tile, l_cols) - umc(i,1:n_cols) = 0.d0 - enddo - !$omp do - do i=1,l_rows - vmr(i,n_cols+1:2*n_cols) = 0.d0 - enddo - if (l_cols>0 .and. l_rows>0) then - - !SYMM variant 4 - !Partitioned Matrix Expression: - ! Ct = Atl Bt + Atr Bb - ! Cb = Atr' Bt + Abl Bb - ! - !Loop invariant: - ! Ct = Atl Bt + Atr Bb - ! - !Update: - ! C1 = A10'B0 + A11B1 + A21 B2 - ! - !This algorithm chosen because in this algoirhtm, the loop around the dgemm calls - !is easily parallelized, and regardless of choise of algorithm, - !the startup cost for parallelizing the dgemms inside the loop is too great - - !$omp do schedule(static,1) - do i=0,(istep*nbw-1)/tile_size - lcs = i*l_cols_tile+1 ! local column start - lce = min(l_cols, (i+1)*l_cols_tile) ! local column end - - lrs = i*l_rows_tile+1 ! local row start - lre = min(l_rows, (i+1)*l_rows_tile) ! local row end - - !C1 += [A11 A12] [B1 - ! B2] - if( lre > lrs .and. l_cols > lcs ) then - call DGEMM('N','N', lre-lrs+1, n_cols, l_cols-lcs+1, & - 1.d0, a(lrs,lcs), ubound(a,dim=1), & - umc(lcs,n_cols+1), ubound(umc,dim=1), & - 0.d0, vmr(lrs,n_cols+1), ubound(vmr,dim=1)) - endif - - ! C1 += A10' B0 - if( lce > lcs .and. 
i > 0 ) then - call DGEMM('T','N', lce-lcs+1, n_cols, lrs-1, & - 1.d0, a(1,lcs), ubound(a,dim=1), & - vmr(1,1), ubound(vmr,dim=1), & - 0.d0, umc(lcs,1), ubound(umc,dim=1)) - endif - enddo - endif - else - umc(1:l_cols,1:n_cols) = 0.d0 - vmr(1:l_rows,n_cols+1:2*n_cols) = 0 - if (l_cols>0 .and. l_rows>0) then - do i=0,(istep*nbw-1)/tile_size - - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - if (lce<lcs) cycle - - lre = min(l_rows,(i+1)*l_rows_tile) - call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,dim=1), & - vmr,ubound(vmr,dim=1),1.d0,umc(lcs,1),ubound(umc,dim=1)) - - if (i==0) cycle - lre = min(l_rows,i*l_rows_tile) - call DGEMM('N','N',lre,n_cols,lce-lcs+1,1.d0,a(1,lcs),lda, & - umc(lcs,n_cols+1),ubound(umc,dim=1),1.d0,vmr(1,n_cols+1),ubound(vmr,dim=1)) - enddo - endif - endif -#ifdef WITH_OPENMP - !$omp end parallel -#endif - ! Sum up all ur(:) parts along rows and add them to the uc(:) parts - ! on the processors containing the diagonal - ! This is only necessary if ur has been calculated, i.e. if the - ! global tile size is smaller than the global remaining matrix - ! Or if we used the Algorithm 4 - if (tile_size < istep*nbw .or. n_way > 1) then - call elpa_reduce_add_vectors_real (vmr(1,n_cols+1),ubound(vmr,dim=1),mpi_comm_rows, & - umc, ubound(umc,dim=1), mpi_comm_cols, & - istep*nbw, n_cols, nblk) - endif - - if (l_cols>0) then - allocate(tmp(l_cols,n_cols)) - call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols) - deallocate(tmp) - endif - - ! U = U * Tmat**T - - call dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,dim=1),umc,ubound(umc,dim=1)) - - ! 
VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T - - call dgemm('T','N',n_cols,n_cols,l_cols,1.d0,umc,ubound(umc,dim=1),umc(1,n_cols+1), & - ubound(umc,dim=1),0.d0,vav,ubound(vav,dim=1)) - call dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols,1.d0,tmat(1,1,istep), & - ubound(tmat,dim=1),vav,ubound(vav,dim=1)) - - call symm_matrix_allreduce(n_cols,vav, nbw, nbw ,mpi_comm_cols) - - ! U = U - 0.5 * V * VAV - call dgemm('N','N',l_cols,n_cols,n_cols,-0.5d0,umc(1,n_cols+1),ubound(umc,dim=1),vav, & - ubound(vav,dim=1),1.d0,umc,ubound(umc,dim=1)) - - ! Transpose umc -> umr (stored in vmr, second half) - - call elpa_transpose_vectors_real (umc, ubound(umc,dim=1), mpi_comm_cols, & - vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, & - 1, istep*nbw, n_cols, nblk) - - ! A = A - V*U**T - U*V**T -#ifdef WITH_OPENMP - !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend ) - n_threads = omp_get_num_threads() - if (mod(n_threads, 2) == 0) then - n_way = 2 - else - n_way = 1 - endif - - m_way = n_threads / n_way - - m_id = mod(omp_get_thread_num(), m_way) - n_id = omp_get_thread_num() / m_way - - do ii=n_id*tile_size,(istep*nbw-1),tile_size*n_way - i = ii / tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce<lcs .or. 
lre<1) cycle - - !Figure out this thread's range - work_per_thread = lre / m_way - if (work_per_thread * m_way < lre) work_per_thread = work_per_thread + 1 - mystart = m_id * work_per_thread + 1 - myend = mystart + work_per_thread - 1 - if ( myend > lre ) myend = lre - if ( myend-mystart+1 < 1) cycle - - call dgemm('N','T',myend-mystart+1, lce-lcs+1, 2*n_cols, -1.d0, & - vmr(mystart, 1), ubound(vmr,1), umc(lcs,1), ubound(umc,1), & - 1.d0,a(mystart,lcs),ubound(a,1)) - enddo - !$omp end parallel - -#else /* WITH_OPENMP */ - do i=0,(istep*nbw-1)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce<lcs .or. lre<1) cycle - call dgemm('N','T',lre,lce-lcs+1,2*n_cols,-1.d0, & - vmr,ubound(vmr,dim=1),umc(lcs,1),ubound(umc,dim=1), & - 1.d0,a(1,lcs),lda) - enddo -#endif /* WITH_OPENMP */ - deallocate(vmr, umc, vr) - - enddo - - if (useQR) then - if (which_qr_decomposition == 1) then - deallocate(work_blocked) - deallocate(tauvector) - endif - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("bandred_real") -#endif - end subroutine bandred_real - - subroutine symm_matrix_allreduce(n,a,lda,ldb,comm) - - !------------------------------------------------------------------------------- - ! symm_matrix_allreduce: Does an mpi_allreduce for a symmetric matrix A. - ! On entry, only the upper half of A needs to be set - ! 
On exit, the complete matrix is set - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik) :: n, lda, ldb, comm -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: a(lda,*) -#else - real(kind=rk) :: a(lda,ldb) -#endif - integer(kind=ik) :: i, nc, mpierr - real(kind=rk) :: h1(n*n), h2(n*n) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("symm_matrix_allreduce") -#endif - - nc = 0 - do i=1,n - h1(nc+1:nc+i) = a(1:i,i) - nc = nc+i - enddo - - call mpi_allreduce(h1,h2,nc,MPI_REAL8,MPI_SUM,comm,mpierr) - - nc = 0 - do i=1,n - a(1:i,i) = h2(nc+1:nc+i) - a(i,1:i-1) = a(1:i-1,i) - nc = nc+i - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("symm_matrix_allreduce") -#endif - - end subroutine symm_matrix_allreduce - - subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, numBlocks, mpi_comm_rows, & - mpi_comm_cols, useQR) - !------------------------------------------------------------------------------- - ! trans_ev_band_to_full_real: - ! Transforms the eigenvectors of a band matrix back to the eigenvectors of the original matrix - ! - ! Parameters - ! - ! na Order of matrix a, number of rows of matrix q - ! - ! nqc Number of columns of matrix q - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! nbw semi bandwith - ! - ! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after bandred_real) - ! Distribution is like in Scalapack. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a and q - ! - ! tmat(nbw,nbw,numBlocks) Factors returned by bandred_real - ! - ! q On input: Eigenvectors of band matrix - ! On output: Transformed eigenvectors - ! Distribution is like in Scalapack. - ! - ! ldq Leading dimension of q - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! 
- !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: a(lda,*), q(ldq,*), tmat(nbw,nbw,*) -#else - real(kind=rk) :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks) -#endif - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: max_blocks_row, max_blocks_col, max_local_rows, & - max_local_cols - integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols - integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns - - real(kind=rk), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:) - - integer(kind=ik) :: i - - real(kind=rk), allocatable :: tmat_complete(:,:), t_tmp(:,:), t_tmp2(:,:) - integer(kind=ik) :: cwy_blocking, t_blocking, t_cols, t_rows - logical, intent(in) :: useQR - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("trans_ev_band_to_full_real") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - max_blocks_row = ((na -1)/nblk)/np_rows + 1 ! Rows of A - max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q! - - max_local_rows = max_blocks_row*nblk - max_local_cols = max_blocks_col*nblk - - ! t_blocking was formerly 2; 3 is a better choice - t_blocking = 3 ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once - - ! we only use the t_blocking if we could call it fully, this is might be better but needs to benchmarked. -! 
if ( na >= ((t_blocking+1)*nbw) ) then - cwy_blocking = t_blocking * nbw - - allocate(tmp1(max_local_cols*cwy_blocking)) - allocate(tmp2(max_local_cols*cwy_blocking)) - allocate(hvb(max_local_rows*cwy_blocking)) - allocate(hvm(max_local_rows,cwy_blocking)) - allocate(tmat_complete(cwy_blocking,cwy_blocking)) - allocate(t_tmp(cwy_blocking,nbw)) - allocate(t_tmp2(cwy_blocking,nbw)) -! else -! allocate(tmp1(max_local_cols*nbw)) -! allocate(tmp2(max_local_cols*nbw)) -! allocate(hvb(max_local_rows*nbw)) -! allocate(hvm(max_local_rows,nbw)) -! endif - - hvm = 0 ! Must be set to 0 !!! - hvb = 0 ! Safety only - - l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q - -! if ( na >= ((t_blocking+1)*nbw) ) then - - do istep=1,((na-1)/nbw-1)/t_blocking + 1 - ! This the call when using na >= ((t_blocking+1)*nbw) - ! n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step - ! As an alternative we add some special case handling if na < cwy_blocking - IF (na < cwy_blocking) THEN - n_cols = MAX(0, na-nbw) - IF ( n_cols .eq. 0 ) THEN - EXIT - END IF - ELSE - n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step - END IF - - ! Broadcast all Householder vectors for current step compressed in hvb - - nb = 0 - ns = 0 - - do lc = 1, n_cols - ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder vector - nrow = ncol - nbw ! absolute number of pivot row - - l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast - l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number - - if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh) - - nb = nb+l_rows - - if (lc==n_cols .or. mod(ncol,nblk)==0) then - call MPI_Bcast(hvb(ns+1),nb-ns,MPI_REAL8,pcol(ncol, nblk, np_cols),mpi_comm_cols,mpierr) - ns = nb - endif - enddo - - ! 
Expand compressed Householder vectors into matrix hvm - - nb = 0 - do lc = 1, n_cols - nrow = (istep-1)*cwy_blocking + lc ! absolute number of pivot row - l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast - - hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows) - if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1. - - nb = nb+l_rows - enddo - - l_rows = local_index(MIN(na,(istep+1)*cwy_blocking), my_prow, np_rows, nblk, -1) - - ! compute tmat2 out of tmat(:,:,) - tmat_complete = 0 - do i = 1, t_blocking - t_cols = MIN(nbw, n_cols - (i-1)*nbw) - if (t_cols <= 0) exit - t_rows = (i - 1) * nbw - tmat_complete(t_rows+1:t_rows+t_cols,t_rows+1:t_rows+t_cols) = tmat(1:t_cols,1:t_cols,(istep-1)*t_blocking + i) - if (i > 1) then - call dgemm('T', 'N', t_rows, t_cols, l_rows, 1.d0, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), & - max_local_rows, 0.d0, t_tmp, cwy_blocking) - call mpi_allreduce(t_tmp,t_tmp2,cwy_blocking*nbw,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - call dtrmm('L','U','N','N',t_rows,t_cols,1.0d0,tmat_complete,cwy_blocking,t_tmp2,cwy_blocking) - call dtrmm('R','U','N','N',t_rows,t_cols,-1.0d0,tmat_complete(t_rows+1,t_rows+1),cwy_blocking,t_tmp2,cwy_blocking) - tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols) - endif - enddo - - ! Q = Q - V * T**T * V**T * Q - - if (l_rows>0) then - call dgemm('T','N',n_cols,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), & - q,ldq,0.d0,tmp1,n_cols) - else - tmp1(1:l_cols*n_cols) = 0 - endif - call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - - - if (l_rows>0) then - call dtrmm('L','U','T','N',n_cols,l_cols,1.0d0,tmat_complete,cwy_blocking,tmp2,n_cols) - call dgemm('N','N',l_rows,l_cols,n_cols,-1.d0,hvm,ubound(hvm,dim=1), tmp2,n_cols,1.d0,q,ldq) - endif - enddo - -! else -! -! do istep=1,(na-1)/nbw -! -! n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step -! -! ! 
Broadcast all Householder vectors for current step compressed in hvb -! -! nb = 0 -! ns = 0 -! -! do lc = 1, n_cols -! ncol = istep*nbw + lc ! absolute column number of householder vector -! nrow = ncol - nbw ! absolute number of pivot row -! -! l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast -! l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number -! -! if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh) -! -! nb = nb+l_rows -! -! if (lc==n_cols .or. mod(ncol,nblk)==0) then -! call MPI_Bcast(hvb(ns+1),nb-ns,MPI_REAL8,pcol(ncol, nblk, np_cols),mpi_comm_cols,mpierr) -! ns = nb -! endif -! enddo -! -! ! Expand compressed Householder vectors into matrix hvm -! -! nb = 0 -! do lc = 1, n_cols -! nrow = (istep-1)*nbw+lc ! absolute number of pivot row -! l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast -! -! hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows) -! if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1. -! -! nb = nb+l_rows -! enddo -! -! l_rows = local_index(MIN(na,(istep+1)*nbw), my_prow, np_rows, nblk, -1) -! -! ! Q = Q - V * T**T * V**T * Q -! -! if (l_rows>0) then -! call dgemm('T','N',n_cols,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), & -! q,ldq,0.d0,tmp1,n_cols) -! else -! tmp1(1:l_cols*n_cols) = 0 -! endif -! -! call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) -! -! if (l_rows>0) then -! call dtrmm('L','U','T','N',n_cols,l_cols,1.0d0,tmat(1,1,istep),ubound(tmat,dim=1),tmp2,n_cols) -! call dgemm('N','N',l_rows,l_cols,n_cols,-1.d0,hvm,ubound(hvm,dim=1), & -! tmp2,n_cols,1.d0,q,ldq) -! endif -! enddo -! endif - - deallocate(tmp1, tmp2, hvb, hvm) -! if ( na >= ((t_blocking+1)*nbw) ) then - deallocate(tmat_complete, t_tmp, t_tmp2) -! 
endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_band_to_full_real") -#endif - end subroutine trans_ev_band_to_full_real - - subroutine tridiag_band_real(na, nb, nblk, a, lda, d, e, matrixCols, hh_trans_real, & - mpi_comm_rows, mpi_comm_cols, mpi_comm) - - !------------------------------------------------------------------------------- - ! tridiag_band_real: - ! Reduces a real symmetric band matrix to tridiagonal form - ! - ! na Order of matrix a - ! - ! nb Semi bandwith - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! a(lda,matrixCols) Distributed system matrix reduced to banded form in the upper diagonal - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! d(na) Diagonal of tridiagonal matrix, set only on PE 0 (output) - ! - ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0 (output) - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! mpi_comm - ! MPI-Communicator for the total processor set - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk), intent(in) :: a(lda,*) -#else - real(kind=rk), intent(in) :: a(lda,matrixCols) -#endif - real(kind=rk), intent(out) :: d(na), e(na) ! 
set only on PE 0 - real(kind=rk), intent(out), & - allocatable :: hh_trans_real(:,:) - - real(kind=rk) :: vnorm2, hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf - real(kind=rk) :: hd(nb), hs(nb) - - integer(kind=ik) :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt - integer(kind=ik) :: my_pe, n_pes, mpierr - integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols - integer(kind=ik) :: ireq_ab, ireq_hv - integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off -#ifdef WITH_OPENMP - integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter - integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) - integer(kind=ik), allocatable :: mpi_statuses(:,:), global_id_tmp(:,:) - integer(kind=ik), allocatable :: omp_block_limits(:) - real(kind=rk), allocatable :: hv_t(:,:), tau_t(:) -#endif - integer(kind=ik), allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:) - integer(kind=ik), allocatable :: limits(:), snd_limits(:,:) - integer(kind=ik), allocatable :: block_limits(:) - real(kind=rk), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:) - -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("tridiag_band_real") -#endif - - call mpi_comm_rank(mpi_comm,my_pe,mpierr) - call mpi_comm_size(mpi_comm,n_pes,mpierr) - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - ! 
Get global_id mapping 2D procssor coordinates to global id - - allocate(global_id(0:np_rows-1,0:np_cols-1)) - global_id(:,:) = 0 - global_id(my_prow, my_pcol) = my_pe -#ifdef WITH_OPENMP - allocate(global_id_tmp(0:np_rows-1,0:np_cols-1)) -#endif - -#ifndef WITH_OPENMP - call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) -#else - global_id_tmp(:,:) = global_id(:,:) - call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) - deallocate(global_id_tmp) -#endif - - ! Total number of blocks in the band: - - nblocks_total = (na-1)/nb + 1 - - ! Set work distribution - - allocate(block_limits(0:n_pes)) - call divide_band(nblocks_total, n_pes, block_limits) - - ! nblocks: the number of blocks for my task - nblocks = block_limits(my_pe+1) - block_limits(my_pe) - - ! allocate the part of the band matrix which is needed by this PE - ! The size is 1 block larger than needed to avoid extensive shifts - allocate(ab(2*nb,(nblocks+1)*nb)) - ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety - - ! n_off: Offset of ab within band - n_off = block_limits(my_pe)*nb - - ! Redistribute band in a to ab - call redist_band_real(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab) - - ! Calculate the workload for each sweep in the back transformation - ! and the space requirements to hold the HH vectors - - allocate(limits(0:np_rows)) - call determine_workload(na, nb, np_rows, limits) - max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) - - num_hh_vecs = 0 - num_chunks = 0 - nx = na - do n = 1, nblocks_total - call determine_workload(nx, nb, np_rows, limits) - local_size = limits(my_prow+1) - limits(my_prow) - ! add to number of householder vectors - ! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor send below! - if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. 
nx>1) then - num_hh_vecs = num_hh_vecs + local_size - num_chunks = num_chunks+1 - endif - nx = nx - nb - enddo - - ! Allocate space for HH vectors - - allocate(hh_trans_real(nb,num_hh_vecs)) - - ! Allocate and init MPI requests - - allocate(ireq_hhr(num_chunks)) ! Recv requests - allocate(ireq_hhs(nblocks)) ! Send requests - - num_hh_vecs = 0 - num_chunks = 0 - nx = na - nt = 0 - do n = 1, nblocks_total - call determine_workload(nx, nb, np_rows, limits) - local_size = limits(my_prow+1) - limits(my_prow) - if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then - num_chunks = num_chunks+1 - call mpi_irecv(hh_trans_real(1,num_hh_vecs+1), nb*local_size, mpi_real8, nt, & - 10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr) - num_hh_vecs = num_hh_vecs + local_size - endif - nx = nx - nb - if (n == block_limits(nt+1)) then - nt = nt + 1 - endif - enddo - - ireq_hhs(:) = MPI_REQUEST_NULL - - ! Buffers for gathering/sending the HH vectors - - allocate(hh_gath(nb,max_blk_size,nblocks)) ! gathers HH vectors - allocate(hh_send(nb,max_blk_size,nblocks)) ! send buffer for HH vectors - hh_gath(:,:,:) = 0 - hh_send(:,:,:) = 0 - - ! Some counters - - allocate(hh_cnt(nblocks)) - allocate(hh_dst(nblocks)) - - hh_cnt(:) = 1 ! The first transfomation vector is always 0 and not calculated at all - hh_dst(:) = 0 ! PE number for receive - - ireq_ab = MPI_REQUEST_NULL - ireq_hv = MPI_REQUEST_NULL - - ! Limits for sending - - allocate(snd_limits(0:np_rows,nblocks)) - - do iblk=1,nblocks - call determine_workload(na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk)) - enddo - -#ifdef WITH_OPENMP - ! OpenMP work distribution: - - max_threads = 1 - max_threads = omp_get_max_threads() - - ! For OpenMP we need at least 2 blocks for every thread - max_threads = MIN(max_threads, nblocks/2) - if (max_threads==0) max_threads = 1 - - allocate(omp_block_limits(0:max_threads)) - - ! 
Get the OpenMP block limits - call divide_band(nblocks, max_threads, omp_block_limits) - - allocate(hv_t(nb,max_threads), tau_t(max_threads)) - hv_t = 0 - tau_t = 0 -#endif - - ! --------------------------------------------------------------------------- - ! Start of calculations - - na_s = block_limits(my_pe)*nb + 1 - - if (my_pe>0 .and. na_s<=na) then - ! send first column to previous PE - ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also) - ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) - call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) - endif - -#ifdef WITH_OPENMP - do istep=1,na-1-block_limits(my_pe)*nb -#else - do istep=1,na-1 -#endif - - if (my_pe==0) then - n = MIN(na-na_s,nb) ! number of rows to be reduced - hv(:) = 0 - tau = 0 - ! The last step (istep=na-1) is only needed for sending the last HH vectors. - ! We don't want the sign of the last element flipped (analogous to the other sweeps) - if (istep < na-1) then - ! Transform first column of remaining matrix - vnorm2 = sum(ab(3:n+1,na_s-n_off)**2) - call hh_transform_real(ab(2,na_s-n_off),vnorm2,hf,tau) - hv(1) = 1 - hv(2:n) = ab(3:n+1,na_s-n_off)*hf - endif - d(istep) = ab(1,na_s-n_off) - e(istep) = ab(2,na_s-n_off) - if (istep == na-1) then - d(na) = ab(1,na_s+1-n_off) - e(na) = 0 - endif - else - if (na>na_s) then - ! Receive Householder vector from previous task, from PE owning subdiagonal -#ifdef WITH_OPENMP - call mpi_recv(hv,nb,mpi_real8,my_pe-1,2,mpi_comm,MPI_STATUS,mpierr) -#else - call mpi_recv(hv,nb,mpi_real8,my_pe-1,2,mpi_comm,MPI_STATUS_IGNORE,mpierr) -#endif - tau = hv(1) - hv(1) = 1. - endif - endif - - na_s = na_s+1 - if (na_s-n_off > nb) then - ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb) - ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0 - n_off = n_off + nb - endif - - -#ifdef WITH_OPENMP - if (max_threads > 1) then - - ! Codepath for OpenMP - - ! 
Please note that in this case it is absolutely necessary to have at least 2 blocks per thread! - ! Every thread is one reduction cycle behind its predecessor and thus starts one step later. - ! This simulates the behaviour of the MPI tasks which also work after each other. - ! The code would be considerably easier, if the MPI communication would be made within - ! the parallel region - this is avoided here since this would require - ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program. - - hv_t(:,1) = hv - tau_t(1) = tau - - do iter = 1, 2 - - ! iter=1 : work on first block - ! iter=2 : work on remaining blocks - ! This is done in 2 iterations so that we have a barrier in between: - ! After the first iteration, it is guaranteed that the last row of the last block - ! is completed by the next thread. - ! After the first iteration it is also the place to exchange the last row - ! with MPI calls -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, & -!$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads) - do my_thread = 1, max_threads - - if (iter == 1) then - my_block_s = omp_block_limits(my_thread-1) + 1 - my_block_e = my_block_s - else - my_block_s = omp_block_limits(my_thread-1) + 2 - my_block_e = omp_block_limits(my_thread) - endif - - do iblk = my_block_s, my_block_e - - ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block - ne = ns+nb-1 ! last column in block - - if (istep<my_thread .or. ns+n_off>na) exit - - hv = hv_t(:,my_thread) - tau = tau_t(my_thread) - - ! Store Householder vector for back transformation - - hh_cnt(iblk) = hh_cnt(iblk) + 1 - - hh_gath(1 ,hh_cnt(iblk),iblk) = tau - hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) - - nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block - nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) - ! 
Note that nr>=0 implies that diagonal block is full (nc==nb)! - - ! Transform diagonal block - - call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1) - - x = dot_product(hv(1:nc),hd(1:nc))*tau - hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) - - call DSYR2('L',nc,-1.d0,hd,1,hv,1,ab(1,ns),2*nb-1) - - hv_t(:,my_thread) = 0 - tau_t(my_thread) = 0 - - if (nr<=0) cycle ! No subdiagonal block present any more - - ! Transform subdiagonal block - - call DGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1) - - if (nr>1) then - - ! complete (old) Householder transformation for first column - - ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 - - ! calculate new Householder transformation for first column - ! (stored in hv_t(:,my_thread) and tau_t(my_thread)) - - vnorm2 = sum(ab(nb+2:nb+nr,ns)**2) - call hh_transform_real(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread)) - hv_t(1 ,my_thread) = 1. - hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf - ab(nb+2:,ns) = 0 - - ! update subdiagonal block for old and new Householder transformation - ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster - - call DGEMV('T',nr,nb-1,tau_t(my_thread),ab(nb,ns+1),2*nb-1,hv_t(1,my_thread),1,0.d0,h(2),1) - x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread) - h(2:nb) = h(2:nb) - x*hv(2:nb) - ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2") - do i=2,nb - ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*h(i) - hs(1:nr)*hv(i) - enddo - - else - - ! No new Householder transformation for nr=1, just complete the old one - ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1 - do i=2,nb - ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i) - enddo - ! For safety: there is one remaining dummy transformation (but tau is 0 anyways) - hv_t(1,my_thread) = 1. - - endif - - enddo - - enddo ! 
my_thread -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - - if (iter==1) then - ! We are at the end of the first block - - ! Send our first column to previous PE - if (my_pe>0 .and. na_s <= na) then - call mpi_wait(ireq_ab,mpi_status,mpierr) - ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) - call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) - endif - - ! Request last column from next PE - ne = na_s + nblocks*nb - (max_threads-1) - 1 - if (istep>=max_threads .and. ne <= na) then - call mpi_recv(ab(1,ne-n_off),nb+1,mpi_real8,my_pe+1,1,mpi_comm,mpi_status,mpierr) - endif - - else - ! We are at the end of all blocks - - ! Send last HH vector and TAU to next PE if it has been calculated above - ne = na_s + nblocks*nb - (max_threads-1) - 1 - if (istep>=max_threads .and. ne < na) then - call mpi_wait(ireq_hv,mpi_status,mpierr) - hv_s(1) = tau_t(max_threads) - hv_s(2:) = hv_t(2:,max_threads) - call mpi_isend(hv_s,nb,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr) - endif - - ! "Send" HH vector and TAU to next OpenMP thread - do my_thread = max_threads, 2, -1 - hv_t(:,my_thread) = hv_t(:,my_thread-1) - tau_t(my_thread) = tau_t(my_thread-1) - enddo - - endif - enddo ! iter - - else - - ! Codepath for 1 thread without OpenMP - - ! The following code is structured in a way to keep waiting times for - ! other PEs at a minimum, especially if there is only one block. - ! For this reason, it requests the last column as late as possible - ! and sends the Householder vector and the first column as early - ! as possible. - -#endif /* WITH_OPENMP */ - - do iblk=1,nblocks - - ns = na_s + (iblk-1)*nb - n_off ! first column in block - ne = ns+nb-1 ! last column in block - - if (ns+n_off>na) exit - - ! 
Store Householder vector for back transformation - - hh_cnt(iblk) = hh_cnt(iblk) + 1 - - hh_gath(1 ,hh_cnt(iblk),iblk) = tau - hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) - -#ifndef WITH_OPENMP - if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then - ! Wait for last transfer to finish - - call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr) - - ! Copy vectors into send buffer - hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) - ! Send to destination - call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real8, & - global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), & - 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) - ! Reset counter and increase destination row - hh_cnt(iblk) = 0 - hh_dst(iblk) = hh_dst(iblk)+1 - endif - - ! The following code is structured in a way to keep waiting times for - ! other PEs at a minimum, especially if there is only one block. - ! For this reason, it requests the last column as late as possible - ! and sends the Householder vector and the first column as early - ! as possible. -#endif - nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block - nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) - ! Note that nr>=0 implies that diagonal block is full (nc==nb)! - - ! Multiply diagonal block and subdiagonal block with Householder vector - - if (iblk==nblocks .and. nc==nb) then - - ! We need the last column from the next PE. - ! First do the matrix multiplications without last column ... - - ! Diagonal block, the contribution of the last element is added below! - ab(1,ne) = 0 - call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1) - - ! Subdiagonal block - if (nr>0) call DGEMV('N',nr,nb-1,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1) - - ! ... then request last column ... 
-#ifdef WITH_OPENMP - call mpi_recv(ab(1,ne),nb+1,mpi_real8,my_pe+1,1,mpi_comm,MPI_STATUS,mpierr) -#else - call mpi_recv(ab(1,ne),nb+1,mpi_real8,my_pe+1,1,mpi_comm,MPI_STATUS_IGNORE,mpierr) -#endif - - ! ... and complete the result - hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb) - hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau - - else - - ! Normal matrix multiply - call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1) - if (nr>0) call DGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1) - - endif - - ! Calculate first column of subdiagonal block and calculate new - ! Householder transformation for this column - - hv_new(:) = 0 ! Needed, last rows must be 0 for nr < nb - tau_new = 0 - - if (nr>0) then - - ! complete (old) Householder transformation for first column - - ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 - - ! calculate new Householder transformation ... - if (nr>1) then - vnorm2 = sum(ab(nb+2:nb+nr,ns)**2) - call hh_transform_real(ab(nb+1,ns),vnorm2,hf,tau_new) - hv_new(1) = 1. - hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf - ab(nb+2:,ns) = 0 - endif - - ! ... and send it away immediatly if this is the last block - - if (iblk==nblocks) then -#ifdef WITH_OPENMP - call mpi_wait(ireq_hv,MPI_STATUS,mpierr) -#else - call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) -#endif - hv_s(1) = tau_new - hv_s(2:) = hv_new(2:) - call mpi_isend(hv_s,nb,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr) - endif - - endif - - ! Transform diagonal block - x = dot_product(hv(1:nc),hd(1:nc))*tau - hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) - - if (my_pe>0 .and. iblk==1) then - - ! The first column of the diagonal block has to be send to the previous PE - ! Calculate first column only ... - - ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*hv(1) - hv(1:nc)*hd(1) - - ! ... send it away ... 
- -#ifdef WITH_OPENMP - call mpi_wait(ireq_ab,MPI_STATUS,mpierr) -#else - call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) -#endif - ab_s(1:nb+1) = ab(1:nb+1,ns) - call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) - - ! ... and calculate remaining columns with rank-2 update - if (nc>1) call DSYR2('L',nc-1,-1.d0,hd(2),1,hv(2),1,ab(1,ns+1),2*nb-1) - else - ! No need to send, just a rank-2 update - call DSYR2('L',nc,-1.d0,hd,1,hv,1,ab(1,ns),2*nb-1) - endif - - ! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb - - if (nr>0) then - if (nr>1) then - call DGEMV('T',nr,nb-1,tau_new,ab(nb,ns+1),2*nb-1,hv_new,1,0.d0,h(2),1) - x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new - h(2:nb) = h(2:nb) - x*hv(2:nb) - ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update - do i=2,nb - ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*h(i) - hs(1:nr)*hv(i) - enddo - else - ! No double Householder transformation for nr=1, just complete the row - do i=2,nb - ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i) - enddo - endif - endif - - ! Use new HH vector for the next block - hv(:) = hv_new(:) - tau = tau_new - - enddo - -#ifdef WITH_OPENMP - endif - - do iblk = 1, nblocks - - if (hh_dst(iblk) >= np_rows) exit - if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit - - if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then - ! Wait for last transfer to finish - call mpi_wait(ireq_hhs(iblk), mpi_status, mpierr) - ! Copy vectors into send buffer - hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) - ! Send to destination - call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real8, & - global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), & - 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) - ! 
Reset counter and increase destination row - hh_cnt(iblk) = 0 - hh_dst(iblk) = hh_dst(iblk)+1 - endif - - enddo -#endif - enddo - - ! Finish the last outstanding requests -#ifdef WITH_OPENMP - call mpi_wait(ireq_ab,MPI_STATUS,mpierr) - call mpi_wait(ireq_hv,MPI_STATUS,mpierr) - - allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks))) - call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES, mpierr) - call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES, mpierr) - deallocate(mpi_statuses) -#else - call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) - call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) - - call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr) - call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr) -#endif - - call mpi_barrier(mpi_comm,mpierr) - - deallocate(ab) - deallocate(ireq_hhr, ireq_hhs) - deallocate(hh_cnt, hh_dst) - deallocate(hh_gath, hh_send) - deallocate(limits, snd_limits) - deallocate(block_limits) - deallocate(global_id) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("tridiag_band_real") -#endif - - end subroutine tridiag_band_real - - subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, & - mpi_comm_rows, mpi_comm_cols, wantDebug, success, & - THIS_REAL_ELPA_KERNEL) - !------------------------------------------------------------------------------- - ! trans_ev_tridi_to_band_real: - ! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix - ! - ! Parameters - ! - ! na Order of matrix a, number of rows of matrix q - ! - ! nev Number eigenvectors to compute (= columns of matrix q) - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! nb semi bandwith - ! - ! q On input: Eigenvectors of tridiagonal matrix - ! On output: Transformed eigenvectors - ! Distribution is like in Scalapack. - ! - ! ldq Leading dimension of q - ! matrixCols local columns of matrix q - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! 
MPI-Communicators for rows/columns/both - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - use pack_unpack_real - use compute_hh_trafo_real - implicit none - - integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL - integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: q(ldq,*) -#else - real(kind=rk) :: q(ldq,matrixCols) -#endif - real(kind=rk), intent(out) :: hh_trans_real(:,:) - integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol - - integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2 - integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end - integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end - integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length - integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count -#ifdef WITH_OPENMP - integer(kind=ik) :: thread_width, csw, b_off, b_len -#endif - integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd - integer(kind=ik) :: a_off, current_tv_off, max_blk_size - integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk -#ifdef WITH_OPENMP - integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) -#endif - logical :: flag - -#ifdef WITH_OPENMP - real(kind=rk), allocatable :: a(:,:,:,:), row(:) -#else - real(kind=rk), allocatable :: a(:,:,:), row(:) -#endif - -#ifdef WITH_OPENMP - real(kind=rk), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:) - real(kind=rk), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:) -#else - real(kind=rk), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:) - real(kind=rk), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:) -#endif - real(kind=rk), allocatable :: result_buffer(:,:,:) - 
real(kind=rk), allocatable :: bcast_buffer(:,:) - - integer(kind=ik) :: n_off - integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:) - integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:) - integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:) -#ifdef WITH_OPENMP - integer(kind=ik), allocatable :: mpi_statuses(:,:) -#endif - ! MPI send/recv tags, arbitrary - - integer(kind=ik), parameter :: bottom_recv_tag = 111 - integer(kind=ik), parameter :: top_recv_tag = 222 - integer(kind=ik), parameter :: result_recv_tag = 333 - - ! Just for measuring the kernel performance - real(kind=rk) :: kernel_time - ! long integer - integer(kind=lik) :: kernel_flops - -#ifdef WITH_OPENMP - integer(kind=ik) :: max_threads, my_thread - integer(kind=ik) :: omp_get_max_threads -#endif - - logical, intent(in) :: wantDebug - logical :: success - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("trans_ev_tridi_to_band_real") -#endif - success = .true. - kernel_time = 1.d-100 - kernel_flops = 0 - -#ifdef WITH_OPENMP - max_threads = 1 - max_threads = omp_get_max_threads() -#endif - - call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr) - call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr) - call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr) - call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr) - - if (mod(nbw,nblk)/=0) then - if (my_prow==0 .and. my_pcol==0) then - if (wantDebug) then - write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: ERROR: nbw=',nbw,', nblk=',nblk - write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: band backtransform works only for nbw==n*nblk' - endif - success = .false. - return - endif - endif - - nfact = nbw / nblk - - - ! local number of eigenvectors - l_nev = local_index(nev, my_pcol, np_cols, nblk, -1) - - if (l_nev==0) then -#ifdef WITH_OPENMP - thread_width = 0 -#endif - stripe_width = 0 - stripe_count = 0 - last_stripe_width = 0 - else - ! 
Suggested stripe width is 48 since 48*64 real*8 numbers should fit into - ! every primary cache -#ifdef WITH_OPENMP - thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread -#endif - stripe_width = 48 ! Must be a multiple of 4 -#ifdef WITH_OPENMP - stripe_count = (thread_width-1)/stripe_width + 1 -#else - stripe_count = (l_nev-1)/stripe_width + 1 -#endif - ! Adapt stripe width so that last one doesn't get too small -#ifdef WITH_OPENMP - stripe_width = (thread_width-1)/stripe_count + 1 -#else - stripe_width = (l_nev-1)/stripe_count + 1 -#endif - stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 !!! - last_stripe_width = l_nev - (stripe_count-1)*stripe_width - endif - - ! Determine the matrix distribution at the beginning - - allocate(limits(0:np_rows)) - - call determine_workload(na, nbw, np_rows, limits) - - max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) - - a_dim2 = max_blk_size + nbw - -!DEC$ ATTRIBUTES ALIGN: 64:: a -#ifdef WITH_OPENMP - allocate(a(stripe_width,a_dim2,stripe_count,max_threads)) - ! a(:,:,:,:) should be set to 0 in a parallel region, not here! -#else - allocate(a(stripe_width,a_dim2,stripe_count)) - a(:,:,:) = 0 -#endif - - allocate(row(l_nev)) - row(:) = 0 - - ! Copy q from a block cyclic distribution into a distribution with contiguous rows, - ! and transpose the matrix using stripes of given stripe_width for cache blocking. - - ! The peculiar way it is done below is due to the fact that the last row should be - ! ready first since it is the first one to start below - -#ifdef WITH_OPENMP - ! Please note about the OMP usage below: - ! This is not for speed, but because we want the matrix a in the memory and - ! in the cache of the correct thread (if possible) -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - - !$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - a(:,:,:,my_thread) = 0 ! 
if possible, do first touch allocation! - enddo - !$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif -#endif - - do ip = np_rows-1, 0, -1 - if (my_prow == ip) then - ! Receive my rows which have not yet been received - src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) - do i=limits(ip)+1,limits(ip+1) - src = mod((i-1)/nblk, np_rows) - if (src < my_prow) then -#ifdef WITH_OPENMP - call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS, mpierr) -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call unpack_row_real_cpu_openmp(a, row,i-limits(ip),my_thread, stripe_count, & - thread_width, stripe_width, l_nev) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) - call unpack_row_real_cpu(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) -#endif - elseif (src==my_prow) then - src_offset = src_offset+1 - row(:) = q(src_offset, 1:l_nev) -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call unpack_row_real_cpu_openmp(a, row,i-limits(ip),my_thread, & - stripe_count, thread_width, stripe_width, l_nev) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call unpack_row_real_cpu(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) -#endif - endif - enddo - ! 
Send all rows which have not yet been send - src_offset = 0 - do dst = 0, ip-1 - do i=limits(dst)+1,limits(dst+1) - if (mod((i-1)/nblk, np_rows) == my_prow) then - src_offset = src_offset+1 - row(:) = q(src_offset, 1:l_nev) - call MPI_Send(row, l_nev, MPI_REAL8, dst, 0, mpi_comm_rows, mpierr) - endif - enddo - enddo - else if (my_prow < ip) then - ! Send all rows going to PE ip - src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) - do i=limits(ip)+1,limits(ip+1) - src = mod((i-1)/nblk, np_rows) - if (src == my_prow) then - src_offset = src_offset+1 - row(:) = q(src_offset, 1:l_nev) - call MPI_Send(row, l_nev, MPI_REAL8, ip, 0, mpi_comm_rows, mpierr) - endif - enddo - ! Receive all rows from PE ip - do i=limits(my_prow)+1,limits(my_prow+1) - src = mod((i-1)/nblk, np_rows) - if (src == ip) then -#ifdef WITH_OPENMP - call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS, mpierr) -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call unpack_row_real_cpu_openmp(a, row,i-limits(my_prow),my_thread, & - stripe_count, thread_width, stripe_width, l_nev) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) - call unpack_row_real_cpu(a, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width) -#endif - endif - enddo - endif - enddo - - ! Set up result buffer queue - - num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows - - num_result_buffers = 4*nfact - allocate(result_buffer(l_nev,nblk,num_result_buffers)) - - allocate(result_send_request(num_result_buffers)) - allocate(result_recv_request(num_result_buffers)) - result_send_request(:) = MPI_REQUEST_NULL - result_recv_request(:) = MPI_REQUEST_NULL - - ! Queue up buffers - - if (my_prow > 0 .and. 
l_nev>0) then ! note: row 0 always sends - do j = 1, min(num_result_buffers, num_result_blocks) - call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_REAL8, 0, result_recv_tag, & - mpi_comm_rows, result_recv_request(j), mpierr) - enddo - endif - - num_bufs_recvd = 0 ! No buffers received yet - - ! Initialize top/bottom requests - - allocate(top_send_request(stripe_count)) - allocate(top_recv_request(stripe_count)) - allocate(bottom_send_request(stripe_count)) - allocate(bottom_recv_request(stripe_count)) - - top_send_request(:) = MPI_REQUEST_NULL - top_recv_request(:) = MPI_REQUEST_NULL - bottom_send_request(:) = MPI_REQUEST_NULL - bottom_recv_request(:) = MPI_REQUEST_NULL - -#ifdef WITH_OPENMP - allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count)) - allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count)) - allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count)) - allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count)) - - top_border_send_buffer(:,:) = 0 - top_border_recv_buffer(:,:) = 0 - bottom_border_send_buffer(:,:) = 0 - bottom_border_recv_buffer(:,:) = 0 - - ! Initialize broadcast buffer -#else - allocate(top_border_send_buffer(stripe_width, nbw, stripe_count)) - allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count)) - allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count)) - allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count)) - - top_border_send_buffer(:,:,:) = 0 - top_border_recv_buffer(:,:,:) = 0 - bottom_border_send_buffer(:,:,:) = 0 - bottom_border_recv_buffer(:,:,:) = 0 -#endif - - allocate(bcast_buffer(nbw, max_blk_size)) - bcast_buffer = 0 - - current_tv_off = 0 ! Offset of next row to be broadcast - - ! ------------------- start of work loop ------------------- - - a_off = 0 ! 
offset in A (to avoid unnecessary shifts) - - top_msg_length = 0 - bottom_msg_length = 0 - - do sweep = 0, (na-1)/nbw - - current_n = na - sweep*nbw - call determine_workload(current_n, nbw, np_rows, limits) - current_n_start = limits(my_prow) - current_n_end = limits(my_prow+1) - current_local_n = current_n_end - current_n_start - - next_n = max(current_n - nbw, 0) - call determine_workload(next_n, nbw, np_rows, limits) - next_n_start = limits(my_prow) - next_n_end = limits(my_prow+1) - next_local_n = next_n_end - next_n_start - - if (next_n_end < next_n) then - bottom_msg_length = current_n_end - next_n_end - else - bottom_msg_length = 0 - endif - - if (next_local_n > 0) then - next_top_msg_length = current_n_start - next_n_start - else - next_top_msg_length = 0 - endif - - if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then - do i = 1, stripe_count -#ifdef WITH_OPENMP - csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width" - b_len = csw*nbw*max_threads - call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_REAL8, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#else - call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#endif - enddo - endif - - if (current_local_n > 1) then - if (my_pcol == mod(sweep,np_cols)) then - bcast_buffer(:,1:current_local_n) = hh_trans_real(:,current_tv_off+1:current_tv_off+current_local_n) - current_tv_off = current_tv_off + current_local_n - endif - call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_REAL8, mod(sweep,np_cols), mpi_comm_cols, mpierr) - else - ! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_real - bcast_buffer(:,1) = 0 - endif - - if (l_nev == 0) cycle - - if (current_local_n > 0) then - - do i = 1, stripe_count -#ifdef WITH_OPENMP - ! Get real stripe width for strip i; - ! 
The last OpenMP tasks may have an even smaller stripe with, - ! but we don't care about this, i.e. we send/recv a bit too much in this case. - ! csw: current_stripe_width - - csw = min(stripe_width, thread_width-(i-1)*stripe_width) -#endif - !wait_b - if (current_n_end < current_n) then -#ifdef WITH_OPENMP - call MPI_Wait(bottom_recv_request(i), MPI_STATUS, mpierr) -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - n_off = current_local_n+a_off - b_len = csw*nbw - b_off = (my_thread-1)*b_len - a(1:csw,n_off+1:n_off+nbw,i,my_thread) = & - reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /)) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr) - n_off = current_local_n+a_off - a(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i) - -#endif - if (next_n_end < next_n) then -#ifdef WITH_OPENMP - call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, & - MPI_REAL8, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#else - call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow+1, bottom_recv_tag, & - - mpi_comm_rows, bottom_recv_request(i), mpierr) -#endif - endif - endif - - if (current_local_n <= bottom_msg_length + top_msg_length) then - - !wait_t - if (top_msg_length>0) then -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(i), MPI_STATUS, mpierr) -#else - call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) - a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) -#endif - endif - - !compute -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, n_off, b_len, b_off), 
schedule(static, 1) - do my_thread = 1, max_threads - if (top_msg_length>0) then - b_len = csw*top_msg_length - b_off = (my_thread-1)*b_len - a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & - reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) - endif - call compute_hh_trafo_real_cpu_openmp(a,stripe_width,a_dim2,stripe_count, max_threads, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & -0, current_local_n, i, my_thread, & - THIS_REAL_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call compute_hh_trafo_real_cpu(a, stripe_width,a_dim2,stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & -0, current_local_n, i, & - last_stripe_width, THIS_REAL_ELPA_KERNEL) -#endif - !send_b -#ifdef WITH_OPENMP - call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) - if (bottom_msg_length>0) then - n_off = current_local_n+nbw-bottom_msg_length+a_off - b_len = csw*bottom_msg_length*max_threads - bottom_border_send_buffer(1:b_len,i) = & - reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) - call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_REAL8, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) - endif -#else - call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) - if (bottom_msg_length>0) then - n_off = current_local_n+nbw-bottom_msg_length+a_off - bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) - call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_REAL8, my_prow+1, & - - - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) - endif -#endif - else - - !compute -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - 
call compute_hh_trafo_real_cpu_openmp(a, stripe_width,a_dim2,stripe_count, max_threads, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & -current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, & - THIS_REAL_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - - !send_b - call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) - if (bottom_msg_length > 0) then - n_off = current_local_n+nbw-bottom_msg_length+a_off - b_len = csw*bottom_msg_length*max_threads - bottom_border_send_buffer(1:b_len,i) = & - reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) - call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_REAL8, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) - endif -#else - call compute_hh_trafo_real_cpu(a, stripe_width,a_dim2,stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & -current_local_n - bottom_msg_length, bottom_msg_length, i, & - last_stripe_width, THIS_REAL_ELPA_KERNEL) - - !send_b - call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) - if (bottom_msg_length > 0) then - n_off = current_local_n+nbw-bottom_msg_length+a_off - bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) - call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_REAL8, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) - endif -#endif - - !compute -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call compute_hh_trafo_real_cpu_openmp(a,stripe_width,a_dim2,stripe_count, max_threads, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, my_thread, & - 
THIS_REAL_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call compute_hh_trafo_real_cpu(a, stripe_width,a_dim2,stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & -top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & - last_stripe_width, THIS_REAL_ELPA_KERNEL) - -#endif - !wait_t - if (top_msg_length>0) then -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(i), mpi_status, mpierr) -#else - call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) - a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) -#endif - endif - - !compute -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - if (top_msg_length>0) then - b_len = csw*top_msg_length - b_off = (my_thread-1)*b_len - a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & - reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) - endif - call compute_hh_trafo_real_cpu_openmp(a, stripe_width,a_dim2,stripe_count, max_threads, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - 0, top_msg_length, i, my_thread, THIS_REAL_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call compute_hh_trafo_real_cpu(a, stripe_width,a_dim2,stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - 0, top_msg_length, i, & - last_stripe_width, THIS_REAL_ELPA_KERNEL) -#endif - endif - - if (next_top_msg_length > 0) then - !request top_border data -#ifdef WITH_OPENMP - b_len = csw*next_top_msg_length*max_threads - call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_REAL8, my_prow-1, & - top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) -#else - call 
MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_REAL8, my_prow-1, & - top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) -#endif - endif - - !send_t - if (my_prow > 0) then -#ifdef WITH_OPENMP - call MPI_Wait(top_send_request(i), mpi_status, mpierr) - b_len = csw*nbw*max_threads - top_border_send_buffer(1:b_len,i) = reshape(a(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /)) - call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_REAL8, & - my_prow-1, bottom_recv_tag, & - mpi_comm_rows, top_send_request(i), mpierr) -#else - call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) - top_border_send_buffer(:,1:nbw,i) = a(:,a_off+1:a_off+nbw,i) - call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow-1, bottom_recv_tag, & - mpi_comm_rows, top_send_request(i), mpierr) - -#endif - endif - - ! Care that there are not too many outstanding top_recv_request's - if (stripe_count > 1) then - if (i>1) then -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(i-1), MPI_STATUS, mpierr) -#else - call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr) -#endif - else -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS, mpierr) -#else - call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr) -#endif - endif - endif - enddo - - top_msg_length = next_top_msg_length - else - ! wait for last top_send_request - do i = 1, stripe_count -#ifdef WITH_OPENMP - call MPI_Wait(top_send_request(i), MPI_STATUS, mpierr) -#else - call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - enddo - endif - - ! Care about the result - - if (my_prow == 0) then - - ! topmost process sends nbw rows to destination processes - - do j=0,nfact-1 - num_blk = sweep*nfact+j ! global number of destination block, 0 based - if (num_blk*nblk >= na) exit - - nbuf = mod(num_blk, num_result_buffers) + 1 ! 
buffer number to get this block - -#ifdef WITH_OPENMP - call MPI_Wait(result_send_request(nbuf), MPI_STATUS, mpierr) -#else - call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr) -#endif - dst = mod(num_blk, np_rows) - - if (dst == 0) then - do i = 1, min(na - num_blk*nblk, nblk) -#ifdef WITH_OPENMP - call pack_row_real_cpu_openmp(a, row, j*nblk+i+a_off, stripe_width, & - stripe_count, max_threads, thread_width, l_nev) -#else - call pack_row_real_cpu(a, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) - -#endif - q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:) - enddo - else - do i = 1, nblk -#ifdef WITH_OPENMP - call pack_row_real_cpu_openmp(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, & - stripe_width, stripe_count, max_threads, thread_width, l_nev) -#else - call pack_row_real_cpu(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) - -#endif - enddo - call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_REAL8, dst, & - result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr) - endif - enddo - - else - - ! receive and store final result - - do j = num_bufs_recvd, num_result_blocks-1 - - nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block - - ! If there is still work to do, just test for the next result request - ! and leave the loop if it is not ready, otherwise wait for all - ! outstanding requests - - if (next_local_n > 0) then -#ifdef WITH_OPENMP - call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS, mpierr) -#else - call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr) - -#endif - if (.not.flag) exit - else -#ifdef WITH_OPENMP - call MPI_Wait(result_recv_request(nbuf), MPI_STATUS, mpierr) -#else - call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr) -#endif - endif - - ! Fill result buffer into q - num_blk = j*np_rows + my_prow ! 
global number of current block, 0 based - do i = 1, min(na - num_blk*nblk, nblk) - q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf) - enddo - - ! Queue result buffer again if there are outstanding blocks left - if (j+num_result_buffers < num_result_blocks) & - call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_REAL8, 0, result_recv_tag, & - mpi_comm_rows, result_recv_request(nbuf), mpierr) - - enddo - num_bufs_recvd = j - - endif - - ! Shift the remaining rows to the front of A (if necessary) - - offset = nbw - top_msg_length - if (offset<0) then - if (wantDebug) write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: internal error, offset for shifting = ',offset - success = .false. - return - endif - - a_off = a_off + offset - if (a_off + next_local_n + nbw > a_dim2) then -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - - !$omp parallel do private(my_thread, i, j), schedule(static, 1) - do my_thread = 1, max_threads - do i = 1, stripe_count - do j = top_msg_length+1, top_msg_length+next_local_n - A(:,j,i,my_thread) = A(:,j+a_off,i,my_thread) - enddo -#else - do i = 1, stripe_count - do j = top_msg_length+1, top_msg_length+next_local_n - A(:,j,i) = A(:,j+a_off,i) -#endif - enddo - enddo -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif -#endif - a_off = 0 - endif - - enddo - - ! 
Just for safety: - if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol - if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol - if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol - if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol - - if (my_prow == 0) then -#ifdef WITH_OPENMP - allocate(mpi_statuses(MPI_STATUS_SIZE,num_result_buffers)) - call MPI_Waitall(num_result_buffers, result_send_request, mpi_statuses, mpierr) - deallocate(mpi_statuses) -#else - call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr) -#endif - endif - - if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol - if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol - - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,'(" Kernel time:",f10.3," MFlops: ",f10.3)') kernel_time, kernel_flops/kernel_time*1.d-6 - - ! 
deallocate all working space - - deallocate(a) - deallocate(row) - deallocate(limits) - deallocate(result_send_request) - deallocate(result_recv_request) - deallocate(top_border_send_buffer) - deallocate(top_border_recv_buffer) - deallocate(bottom_border_send_buffer) - deallocate(bottom_border_recv_buffer) - deallocate(result_buffer) - deallocate(bcast_buffer) - deallocate(top_send_request) - deallocate(top_recv_request) - deallocate(bottom_send_request) - deallocate(bottom_recv_request) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_tridi_to_band_real") -#endif - return - end subroutine trans_ev_tridi_to_band_real - - subroutine single_hh_trafo(q, hh, nb, nq, ldq) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - ! Perform single real Householder transformation. - ! This routine is not performance critical and thus it is coded here in Fortran - - implicit none - integer(kind=ik) :: nb, nq, ldq - real(kind=rk) :: q(ldq, *), hh(*) - - integer(kind=ik) :: i - real(kind=rk) :: v(nq) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("single_hh_trafo") -#endif - - ! v = q * hh - v(:) = q(1:nq,1) - do i=2,nb - v(:) = v(:) + q(1:nq,i) * hh(i) - enddo - - ! v = v * tau - v(:) = v(:) * hh(1) - - ! q = q - v * hh**T - q(1:nq,1) = q(1:nq,1) - v(:) - do i=2,nb - q(1:nq,i) = q(1:nq,i) - v(:) * hh(i) - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("single_hh_trafo") -#endif - end subroutine - - subroutine determine_workload(na, nb, nprocs, limits) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik), intent(in) :: na, nb, nprocs - integer(kind=ik), intent(out) :: limits(0:nprocs) - - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("determine_workload") -#endif - - if (na <= 0) then - limits(:) = 0 - call timer%stop("determine_workload") - return - endif - - if (nb*nprocs > na) then - ! 
there is not enough work for all - do i = 0, nprocs - limits(i) = min(na, i*nb) - enddo - else - do i = 0, nprocs - limits(i) = (i*na)/nprocs - enddo - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("determine_workload") -#endif - end subroutine - - subroutine bandred_complex(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, wantDebug, success) - - !------------------------------------------------------------------------------- - ! bandred_complex: Reduces a distributed hermitian matrix to band form - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be reduced. - ! Distribution is like in Scalapack. - ! Opposed to Scalapack, a(:,:) must be set completely (upper and lower half) - ! a(:,:) is overwritten on exit with the band and the Householder vectors - ! in the upper half. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! nbw semi bandwith of output matrix - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1 - ! Factors for the Householder vectors (returned), needed for back transformation - ! 
- !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck) :: a(lda,*), tmat(nbw,nbw,*) -#else - complex(kind=ck) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks) -#endif - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows - integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow - integer(kind=ik) :: istep, ncol, lch, lcx, nlc - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - - real(kind=rk) :: vnorm2 - complex(kind=ck) :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw) - - complex(kind=ck), allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:) - - logical, intent(in) :: wantDebug - logical, intent(out) :: success -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("bandred_complex") -#endif - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - success = .true. - - ! Semibandwith nbw must be a multiple of blocksize nblk - - if (mod(nbw,nblk)/=0) then - if (my_prow==0 .and. my_pcol==0) then - if (wantDebug) then - write(error_unit,*) 'ELPA2_bandred_complex: ERROR: nbw=',nbw,', nblk=',nblk - write(error_unit,*) 'ELPA2_bandred_complex: ELPA2 works only for nbw==n*nblk' - endif - success = .false. - return - endif - endif - - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! 
make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile - - do istep = (na-1)/nbw, 1, -1 - - n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step - - ! Number of local columns/rows of remaining matrix - l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1) - l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1) - - ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces - - allocate(vmr(max(l_rows,1),2*n_cols)) - allocate(umc(max(l_cols,1),2*n_cols)) - - allocate(vr(l_rows+1)) - - vmr(1:l_rows,1:n_cols) = 0. - vr(:) = 0 - tmat(:,:,istep) = 0 - - ! Reduce current block to lower triangular form - - do lc = n_cols, 1, -1 - - ncol = istep*nbw + lc ! absolute column number of householder vector - nrow = ncol - nbw ! Absolute number of pivot row - - lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length - lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number - - tau = 0 - - if(nrow == 1) exit ! Nothing to do - - cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block - - if (my_pcol==cur_pcol) then - - ! Get vector to be transformed; distribute last element and norm of - ! remaining elements to all procs in current column - - vr(1:lr) = a(1:lr,lch) ! vector to be transformed - - if (my_prow==prow(nrow, nblk, np_rows)) then - aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1)) - aux1(2) = vr(lr) - else - aux1(1) = dot_product(vr(1:lr),vr(1:lr)) - aux1(2) = 0. - endif - - call mpi_allreduce(aux1,aux2,2,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) - - vnorm2 = aux2(1) - vrl = aux2(2) - - ! Householder transformation - - call hh_transform_complex(vrl, vnorm2, xf, tau) - - ! 
Scale vr and store Householder vector for back transformation - - vr(1:lr) = vr(1:lr) * xf - if (my_prow==prow(nrow, nblk, np_rows)) then - a(1:lr-1,lch) = vr(1:lr-1) - a(lr,lch) = vrl - vr(lr) = 1. - else - a(1:lr,lch) = vr(1:lr) - endif - - endif - - ! Broadcast Householder vector and tau along columns - - vr(lr+1) = tau - call MPI_Bcast(vr,lr+1,MPI_DOUBLE_COMPLEX,cur_pcol,mpi_comm_cols,mpierr) - vmr(1:lr,lc) = vr(1:lr) - tau = vr(lr+1) - tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat - - ! Transform remaining columns in current block with Householder vector - - ! Local dot product - - aux1 = 0 - - nlc = 0 ! number of local columns - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 - aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx)) - endif - enddo - - ! Get global dot products - if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) - - ! Transform - - nlc = 0 - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 - a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr) - endif - enddo - - enddo - - ! Calculate scalar products of stored Householder vectors. - ! This can be done in different ways, we use zherk - - vav = 0 - if (l_rows>0) & - call zherk('U','C',n_cols,l_rows,CONE,vmr,ubound(vmr,dim=1),CZERO,vav,ubound(vav,dim=1)) - call herm_matrix_allreduce(n_cols,vav, nbw,nbw,mpi_comm_rows) - - ! Calculate triangular matrix T for block Householder Transformation - - do lc=n_cols,1,-1 - tau = tmat(lc,lc,istep) - if (lc<n_cols) then - call ztrmv('U','C','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,dim=1),vav(lc+1,lc),1) - tmat(lc,lc+1:n_cols,istep) = -tau * conjg(vav(lc+1:n_cols,lc)) - endif - enddo - - ! 
Transpose vmr -> vmc (stored in umc, second half) - - call elpa_transpose_vectors_complex (vmr, ubound(vmr,dim=1), mpi_comm_rows, & - umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, & - 1, istep*nbw, n_cols, nblk) - - ! Calculate umc = A**T * vmr - ! Note that the distributed A has to be transposed - ! Opposed to direct tridiagonalization there is no need to use the cache locality - ! of the tiles, so we can use strips of the matrix - - umc(1:l_cols,1:n_cols) = 0.d0 - vmr(1:l_rows,n_cols+1:2*n_cols) = 0 - if (l_cols>0 .and. l_rows>0) then - do i=0,(istep*nbw-1)/tile_size - - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - if (lce<lcs) cycle - - lre = min(l_rows,(i+1)*l_rows_tile) - call ZGEMM('C','N',lce-lcs+1,n_cols,lre,CONE,a(1,lcs),ubound(a,dim=1), & - vmr,ubound(vmr,dim=1),CONE,umc(lcs,1),ubound(umc,dim=1)) - - if (i==0) cycle - lre = min(l_rows,i*l_rows_tile) - call ZGEMM('N','N',lre,n_cols,lce-lcs+1,CONE,a(1,lcs),lda, & - umc(lcs,n_cols+1),ubound(umc,dim=1),CONE,vmr(1,n_cols+1),ubound(vmr,dim=1)) - enddo - endif - - ! Sum up all ur(:) parts along rows and add them to the uc(:) parts - ! on the processors containing the diagonal - ! This is only necessary if ur has been calculated, i.e. if the - ! global tile size is smaller than the global remaining matrix - - if (tile_size < istep*nbw) then - call elpa_reduce_add_vectors_complex (vmr(1,n_cols+1),ubound(vmr,dim=1),mpi_comm_rows, & - umc, ubound(umc,dim=1), mpi_comm_cols, & - istep*nbw, n_cols, nblk) - endif - - if (l_cols>0) then - allocate(tmp(l_cols,n_cols)) - call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) - umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols) - deallocate(tmp) - endif - - ! U = U * Tmat**T - - call ztrmm('Right','Upper','C','Nonunit',l_cols,n_cols,CONE,tmat(1,1,istep),ubound(tmat,dim=1),umc,ubound(umc,dim=1)) - - ! 
VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T - - call zgemm('C','N',n_cols,n_cols,l_cols,CONE,umc,ubound(umc,dim=1),umc(1,n_cols+1), & - ubound(umc,dim=1),CZERO,vav,ubound(vav,dim=1)) - call ztrmm('Right','Upper','C','Nonunit',n_cols,n_cols,CONE,tmat(1,1,istep),ubound(tmat,dim=1),vav,ubound(vav,dim=1)) - - call herm_matrix_allreduce(n_cols,vav, nbw,nbw,mpi_comm_cols) - - ! U = U - 0.5 * V * VAV - call zgemm('N','N',l_cols,n_cols,n_cols,(-0.5d0,0.d0),umc(1,n_cols+1),ubound(umc,dim=1),vav,ubound(vav,dim=1), & - CONE,umc,ubound(umc,dim=1)) - - ! Transpose umc -> umr (stored in vmr, second half) - - call elpa_transpose_vectors_complex (umc, ubound(umc,dim=1), mpi_comm_cols, & - vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, & - 1, istep*nbw, n_cols, nblk) - - ! A = A - V*U**T - U*V**T - - do i=0,(istep*nbw-1)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce<lcs .or. lre<1) cycle - call zgemm('N','C',lre,lce-lcs+1,2*n_cols,-CONE, & - vmr,ubound(vmr,dim=1),umc(lcs,1),ubound(umc,dim=1), & - CONE,a(1,lcs),lda) - enddo - - deallocate(vmr, umc, vr) - - enddo -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("bandred_complex") -#endif - - end subroutine bandred_complex - - subroutine herm_matrix_allreduce(n,a,lda,ldb,comm) - - !------------------------------------------------------------------------------- - ! herm_matrix_allreduce: Does an mpi_allreduce for a hermitian matrix A. - ! On entry, only the upper half of A needs to be set - ! 
On exit, the complete matrix is set - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik) :: n, lda, ldb, comm - complex(kind=ck) :: a(lda,ldb) - - integer(kind=ik) :: i, nc, mpierr - complex(kind=ck) :: h1(n*n), h2(n*n) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("herm_matrix_allreduce") -#endif - - nc = 0 - do i=1,n - h1(nc+1:nc+i) = a(1:i,i) - nc = nc+i - enddo - - call mpi_allreduce(h1,h2,nc,MPI_DOUBLE_COMPLEX,MPI_SUM,comm,mpierr) - - nc = 0 - do i=1,n - a(1:i,i) = h2(nc+1:nc+i) - a(i,1:i-1) = conjg(a(1:i-1,i)) - nc = nc+i - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("herm_matrix_allreduce") -#endif - - end subroutine herm_matrix_allreduce - - - subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, & - numBlocks, mpi_comm_rows, mpi_comm_cols) - - !------------------------------------------------------------------------------- - ! trans_ev_band_to_full_complex: - ! Transforms the eigenvectors of a band matrix back to the eigenvectors of the original matrix - ! - ! Parameters - ! - ! na Order of matrix a, number of rows of matrix q - ! - ! nqc Number of columns of matrix q - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! nbw semi bandwith - ! - ! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after bandred_complex) - ! Distribution is like in Scalapack. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a and q - ! - ! tmat(nbw,nbw,numBlocks) Factors returned by bandred_complex - ! - ! q On input: Eigenvectors of band matrix - ! On output: Transformed eigenvectors - ! Distribution is like in Scalapack. - ! - ! ldq Leading dimension of q - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! 
- !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck) :: a(lda,*), q(ldq,*), tmat(nbw,nbw,*) -#else - complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks) -#endif - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: max_blocks_row, max_blocks_col, max_local_rows, max_local_cols - integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols - integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns - - complex(kind=ck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:) - - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("trans_ev_band_to_full_complex") -#endif - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - max_blocks_row = ((na -1)/nblk)/np_rows + 1 ! Rows of A - max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q! - - max_local_rows = max_blocks_row*nblk - max_local_cols = max_blocks_col*nblk - - allocate(tmp1(max_local_cols*nbw)) - allocate(tmp2(max_local_cols*nbw)) - allocate(hvb(max_local_rows*nbw)) - allocate(hvm(max_local_rows,nbw)) - - hvm = 0 ! Must be set to 0 !!! - hvb = 0 ! Safety only - - l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q - - do istep=1,(na-1)/nbw - - n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step - - ! Broadcast all Householder vectors for current step compressed in hvb - - nb = 0 - ns = 0 - - do lc = 1, n_cols - ncol = istep*nbw + lc ! 
absolute column number of householder vector - nrow = ncol - nbw ! absolute number of pivot row - - l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast - l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number - - if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh) - - nb = nb+l_rows - - if (lc==n_cols .or. mod(ncol,nblk)==0) then - call MPI_Bcast(hvb(ns+1),nb-ns,MPI_DOUBLE_COMPLEX,pcol(ncol, nblk, np_cols),mpi_comm_cols,mpierr) - ns = nb - endif - enddo - - ! Expand compressed Householder vectors into matrix hvm - - nb = 0 - do lc = 1, n_cols - nrow = (istep-1)*nbw+lc ! absolute number of pivot row - l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast - - hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows) - if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1. - - nb = nb+l_rows - enddo - - l_rows = local_index(MIN(na,(istep+1)*nbw), my_prow, np_rows, nblk, -1) - - ! Q = Q - V * T**T * V**T * Q - - if (l_rows>0) then - call zgemm('C','N',n_cols,l_cols,l_rows,CONE,hvm,ubound(hvm,dim=1), & - q,ldq,CZERO,tmp1,n_cols) - else - tmp1(1:l_cols*n_cols) = 0 - endif - call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) - if (l_rows>0) then - call ztrmm('L','U','C','N',n_cols,l_cols,CONE,tmat(1,1,istep),ubound(tmat,dim=1),tmp2,n_cols) - call zgemm('N','N',l_rows,l_cols,n_cols,-CONE,hvm,ubound(hvm,dim=1), & - tmp2,n_cols,CONE,q,ldq) - endif - - enddo - - deallocate(tmp1, tmp2, hvb, hvm) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_band_to_full_complex") -#endif - - end subroutine trans_ev_band_to_full_complex - - - subroutine tridiag_band_complex(na, nb, nblk, a, lda, d, e, matrixCols, hh_trans_complex, & - mpi_comm_rows, mpi_comm_cols, mpi_comm) - - !------------------------------------------------------------------------------- - ! tridiag_band_complex: - ! 
Reduces a complex hermitian symmetric band matrix to tridiagonal form - ! - ! na Order of matrix a - ! - ! nb Semi bandwith - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! a(lda,matrixCols) Distributed system matrix reduced to banded form in the upper diagonal - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! d(na) Diagonal of tridiagonal matrix, set only on PE 0 (output) - ! - ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0 (output) - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! mpi_comm - ! MPI-Communicator for the total processor set - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck),intent(in) :: a(lda,*) -#else - complex(kind=ck), intent(in) :: a(lda,matrixCols) -#endif - real(kind=rk), intent(out) :: d(na), e(na) ! 
set only on PE 0 - complex(kind=ck), intent(inout), & - allocatable :: hh_trans_complex(:,:) - - real(kind=rk) :: vnorm2 - complex(kind=ck) :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf - complex(kind=ck) :: hd(nb), hs(nb) - - integer(kind=ik) :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt - integer(kind=ik) :: my_pe, n_pes, mpierr - integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols - integer(kind=ik) :: ireq_ab, ireq_hv - integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off -#ifdef WITH_OPENMP - integer(kind=ik), allocatable :: mpi_statuses(:,:) - integer(kind=ik), allocatable :: omp_block_limits(:) - integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter - integer(kind=ik) :: omp_get_max_threads - integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) - complex(kind=ck), allocatable :: hv_t(:,:), tau_t(:) -#endif - integer(kind=ik), allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:) - integer(kind=ik), allocatable :: limits(:), snd_limits(:,:) - integer(kind=ik), allocatable :: block_limits(:) - complex(kind=ck), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:) -! ! dummies for calling redist_band -! real*8 :: r_a(1,1), r_ab(1,1) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("tridiag_band_complex") -#endif - call mpi_comm_rank(mpi_comm,my_pe,mpierr) - call mpi_comm_size(mpi_comm,n_pes,mpierr) - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - ! Get global_id mapping 2D procssor coordinates to global id - - allocate(global_id(0:np_rows-1,0:np_cols-1)) - global_id(:,:) = 0 - global_id(my_prow, my_pcol) = my_pe - - call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) - - - ! 
Total number of blocks in the band: - - nblocks_total = (na-1)/nb + 1 - - ! Set work distribution - - allocate(block_limits(0:n_pes)) - call divide_band(nblocks_total, n_pes, block_limits) - - ! nblocks: the number of blocks for my task - nblocks = block_limits(my_pe+1) - block_limits(my_pe) - - ! allocate the part of the band matrix which is needed by this PE - ! The size is 1 block larger than needed to avoid extensive shifts - allocate(ab(2*nb,(nblocks+1)*nb)) - ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety - - ! n_off: Offset of ab within band - n_off = block_limits(my_pe)*nb - - ! Redistribute band in a to ab - call redist_band_complex(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab) - - ! Calculate the workload for each sweep in the back transformation - ! and the space requirements to hold the HH vectors - - allocate(limits(0:np_rows)) - call determine_workload(na, nb, np_rows, limits) - max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) - - num_hh_vecs = 0 - num_chunks = 0 - nx = na - do n = 1, nblocks_total - call determine_workload(nx, nb, np_rows, limits) - local_size = limits(my_prow+1) - limits(my_prow) - ! add to number of householder vectors - ! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor send below! - if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then - num_hh_vecs = num_hh_vecs + local_size - num_chunks = num_chunks+1 - endif - nx = nx - nb - enddo - - ! Allocate space for HH vectors - - allocate(hh_trans_complex(nb,num_hh_vecs)) - - ! Allocate and init MPI requests - - allocate(ireq_hhr(num_chunks)) ! Recv requests - allocate(ireq_hhs(nblocks)) ! Send requests - - num_hh_vecs = 0 - num_chunks = 0 - nx = na - nt = 0 - do n = 1, nblocks_total - call determine_workload(nx, nb, np_rows, limits) - local_size = limits(my_prow+1) - limits(my_prow) - if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. 
nx>1) then - num_chunks = num_chunks+1 - call mpi_irecv(hh_trans_complex(1,num_hh_vecs+1), nb*local_size, MPI_COMPLEX16, nt, & - 10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr) - num_hh_vecs = num_hh_vecs + local_size - endif - nx = nx - nb - if (n == block_limits(nt+1)) then - nt = nt + 1 - endif - enddo - - ireq_hhs(:) = MPI_REQUEST_NULL - - ! Buffers for gathering/sending the HH vectors - - allocate(hh_gath(nb,max_blk_size,nblocks)) ! gathers HH vectors - allocate(hh_send(nb,max_blk_size,nblocks)) ! send buffer for HH vectors - hh_gath(:,:,:) = 0 - hh_send(:,:,:) = 0 - - ! Some counters - - allocate(hh_cnt(nblocks)) - allocate(hh_dst(nblocks)) - - hh_cnt(:) = 1 ! The first transfomation vector is always 0 and not calculated at all - hh_dst(:) = 0 ! PE number for receive - - ireq_ab = MPI_REQUEST_NULL - ireq_hv = MPI_REQUEST_NULL - - ! Limits for sending - - allocate(snd_limits(0:np_rows,nblocks)) - - do iblk=1,nblocks - call determine_workload(na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk)) - enddo - -#ifdef WITH_OPENMP - ! OpenMP work distribution: - - max_threads = 1 -!$ max_threads = omp_get_max_threads() - - ! For OpenMP we need at least 2 blocks for every thread - max_threads = MIN(max_threads, nblocks/2) - if (max_threads==0) max_threads = 1 - - allocate(omp_block_limits(0:max_threads)) - - ! Get the OpenMP block limits - call divide_band(nblocks, max_threads, omp_block_limits) - - allocate(hv_t(nb,max_threads), tau_t(max_threads)) - hv_t = 0 - tau_t = 0 -#endif - - - ! --------------------------------------------------------------------------- - ! Start of calculations - - na_s = block_limits(my_pe)*nb + 1 - - if (my_pe>0 .and. na_s<=na) then - ! send first column to previous PE - ! 
Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also) - ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) - call mpi_isend(ab_s,nb+1,MPI_COMPLEX16,my_pe-1,1,mpi_comm,ireq_ab,mpierr) - endif - -#ifdef WITH_OPENMP - do istep=1,na-1-block_limits(my_pe)*nb -#else - do istep=1,na-1 -#endif - if (my_pe==0) then - n = MIN(na-na_s,nb) ! number of rows to be reduced - hv(:) = 0 - tau = 0 - ! Transform first column of remaining matrix - ! Opposed to the real case, the last step (istep=na-1) is needed here for making - ! the last subdiagonal element a real number - vnorm2 = sum(dble(ab(3:n+1,na_s-n_off))**2+dimag(ab(3:n+1,na_s-n_off))**2) - if (n<2) vnorm2 = 0. ! Safety only - call hh_transform_complex(ab(2,na_s-n_off),vnorm2,hf,tau) - - hv(1) = 1 - hv(2:n) = ab(3:n+1,na_s-n_off)*hf - - d(istep) = ab(1,na_s-n_off) - e(istep) = ab(2,na_s-n_off) - if (istep == na-1) then - d(na) = ab(1,na_s+1-n_off) - e(na) = 0 - endif - else - if (na>na_s) then - ! Receive Householder vector from previous task, from PE owning subdiagonal -#ifdef WITH_OPENMP - call mpi_recv(hv,nb,MPI_COMPLEX16,my_pe-1,2,mpi_comm,mpi_status,mpierr) -#else - call mpi_recv(hv,nb,MPI_COMPLEX16,my_pe-1,2,mpi_comm,MPI_STATUS_IGNORE,mpierr) -#endif - tau = hv(1) - hv(1) = 1. - endif - endif - - na_s = na_s+1 - if (na_s-n_off > nb) then - ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb) - ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0 - n_off = n_off + nb - endif -#ifdef WITH_OPENMP - if (max_threads > 1) then - - ! Codepath for OpenMP - - ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread! - ! Every thread is one reduction cycle behind its predecessor and thus starts one step later. - ! This simulates the behaviour of the MPI tasks which also work after each other. - ! The code would be considerably easier, if the MPI communication would be made within - ! the parallel region - this is avoided here since this would require - ! 
MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program. - - hv_t(:,1) = hv - tau_t(1) = tau - - do iter = 1, 2 - - ! iter=1 : work on first block - ! iter=2 : work on remaining blocks - ! This is done in 2 iterations so that we have a barrier in between: - ! After the first iteration, it is guaranteed that the last row of the last block - ! is completed by the next thread. - ! After the first iteration it is also the place to exchange the last row - ! with MPI calls -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, & -!$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads) - do my_thread = 1, max_threads - - if (iter == 1) then - my_block_s = omp_block_limits(my_thread-1) + 1 - my_block_e = my_block_s - else - my_block_s = omp_block_limits(my_thread-1) + 2 - my_block_e = omp_block_limits(my_thread) - endif - - do iblk = my_block_s, my_block_e - - ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block - ne = ns+nb-1 ! last column in block - - if (istep<my_thread .or. ns+n_off>na) exit - - hv = hv_t(:,my_thread) - tau = tau_t(my_thread) - - ! Store Householder vector for back transformation - - hh_cnt(iblk) = hh_cnt(iblk) + 1 - - hh_gath(1 ,hh_cnt(iblk),iblk) = tau - hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) - - nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block - nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) - ! Note that nr>=0 implies that diagonal block is full (nc==nb)! - - ! Transform diagonal block - - call ZHEMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,(0.d0,0.d0),hd,1) - - x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau) - hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) - - call ZHER2('L',nc,(-1.d0,0.d0),hd,1,hv,1,ab(1,ns),2*nb-1) - - hv_t(:,my_thread) = 0 - tau_t(my_thread) = 0 - - if (nr<=0) cycle ! No subdiagonal block present any more - - ! 
Transform subdiagonal block - - call ZGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,(0.d0,0.d0),hs,1) - - if (nr>1) then - - ! complete (old) Householder transformation for first column - - ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 - - ! calculate new Householder transformation for first column - ! (stored in hv_t(:,my_thread) and tau_t(my_thread)) - - vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2) - call hh_transform_complex(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread)) - hv_t(1 ,my_thread) = 1. - hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf - ab(nb+2:,ns) = 0 - - ! update subdiagonal block for old and new Householder transformation - ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster - - call ZGEMV('C',nr,nb-1,tau_t(my_thread),ab(nb,ns+1),2*nb-1,hv_t(1,my_thread),1,(0.d0,0.d0),h(2),1) - x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread) - h(2:nb) = h(2:nb) - x*hv(2:nb) - ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2") - do i=2,nb - ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) & - - hv_t(1:nr,my_thread)*conjg(h(i)) - hs(1:nr)*conjg(hv(i)) - enddo - - else - - ! No new Householder transformation for nr=1, just complete the old one - ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1 - do i=2,nb - ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i)) - enddo - ! For safety: there is one remaining dummy transformation (but tau is 0 anyways) - hv_t(1,my_thread) = 1. - - endif - - enddo - - enddo ! my_thread -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - - if (iter==1) then - ! We are at the end of the first block - - ! Send our first column to previous PE - if (my_pe>0 .and. na_s <= na) then - call mpi_wait(ireq_ab,mpi_status,mpierr) - ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) - call mpi_isend(ab_s,nb+1,MPI_COMPLEX16,my_pe-1,1,mpi_comm,ireq_ab,mpierr) - endif - - ! 
Request last column from next PE - ne = na_s + nblocks*nb - (max_threads-1) - 1 - if (istep>=max_threads .and. ne <= na) then - call mpi_recv(ab(1,ne-n_off),nb+1,MPI_COMPLEX16,my_pe+1,1,mpi_comm,mpi_status,mpierr) - endif - - else - ! We are at the end of all blocks - - ! Send last HH vector and TAU to next PE if it has been calculated above - ne = na_s + nblocks*nb - (max_threads-1) - 1 - if (istep>=max_threads .and. ne < na) then - call mpi_wait(ireq_hv,mpi_status,mpierr) - hv_s(1) = tau_t(max_threads) - hv_s(2:) = hv_t(2:,max_threads) - call mpi_isend(hv_s,nb,MPI_COMPLEX16,my_pe+1,2,mpi_comm,ireq_hv,mpierr) - endif - - ! "Send" HH vector and TAU to next OpenMP thread - do my_thread = max_threads, 2, -1 - hv_t(:,my_thread) = hv_t(:,my_thread-1) - tau_t(my_thread) = tau_t(my_thread-1) - enddo - - endif - enddo ! iter - - else - - ! Codepath for 1 thread without OpenMP - - ! The following code is structured in a way to keep waiting times for - ! other PEs at a minimum, especially if there is only one block. - ! For this reason, it requests the last column as late as possible - ! and sends the Householder vector and the first column as early - ! as possible. - -#endif - - do iblk=1,nblocks - - ns = na_s + (iblk-1)*nb - n_off ! first column in block - ne = ns+nb-1 ! last column in block - - if (ns+n_off>na) exit - - ! Store Householder vector for back transformation - - hh_cnt(iblk) = hh_cnt(iblk) + 1 - - hh_gath(1 ,hh_cnt(iblk),iblk) = tau - hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) - - -#ifndef WITH_OPENMP - if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then - ! Wait for last transfer to finish - call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr) - ! Copy vectors into send buffer - hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) - ! 
Send to destination - call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), MPI_COMPLEX16, & - global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), & - 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) - ! Reset counter and increase destination row - hh_cnt(iblk) = 0 - hh_dst(iblk) = hh_dst(iblk)+1 - endif - - - ! The following code is structured in a way to keep waiting times for - ! other PEs at a minimum, especially if there is only one block. - ! For this reason, it requests the last column as late as possible - ! and sends the Householder vector and the first column as early - ! as possible. -#endif - - nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block - nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) - ! Note that nr>=0 implies that diagonal block is full (nc==nb)! - - - ! Multiply diagonal block and subdiagonal block with Householder vector - - if (iblk==nblocks .and. nc==nb) then - - ! We need the last column from the next PE. - ! First do the matrix multiplications without last column ... - - ! Diagonal block, the contribution of the last element is added below! - ab(1,ne) = 0 - call ZHEMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,(0.d0,0.d0),hd,1) - - ! Subdiagonal block - if (nr>0) call ZGEMV('N',nr,nb-1,tau,ab(nb+1,ns),2*nb-1,hv,1,(0.d0,0.d0),hs,1) - - ! ... then request last column ... -#ifdef WITH_OPENMP - call mpi_recv(ab(1,ne),nb+1,MPI_COMPLEX16,my_pe+1,1,mpi_comm,mpi_status,mpierr) - -#else - call mpi_recv(ab(1,ne),nb+1,MPI_COMPLEX16,my_pe+1,1,mpi_comm,MPI_STATUS_IGNORE,mpierr) -#endif - ! ... and complete the result - hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb) - hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau - - else - ! Normal matrix multiply - call ZHEMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,(0.d0,0.d0),hd,1) - if (nr>0) call ZGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,(0.d0,0.d0),hs,1) - - endif - - ! Calculate first column of subdiagonal block and calculate new - ! 
Householder transformation for this column - - hv_new(:) = 0 ! Needed, last rows must be 0 for nr < nb - tau_new = 0 - - if (nr>0) then - - ! complete (old) Householder transformation for first column - - ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 - - ! calculate new Householder transformation ... - if (nr>1) then - vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2) - call hh_transform_complex(ab(nb+1,ns),vnorm2,hf,tau_new) - hv_new(1) = 1. - hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf - ab(nb+2:,ns) = 0 - endif - - ! ... and send it away immediatly if this is the last block - - if (iblk==nblocks) then -#ifdef WITH_OPENMP - call mpi_wait(ireq_hv,mpi_status,mpierr) -#else - call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) -#endif - hv_s(1) = tau_new - hv_s(2:) = hv_new(2:) - call mpi_isend(hv_s,nb,MPI_COMPLEX16,my_pe+1,2,mpi_comm,ireq_hv,mpierr) - endif - - endif - - - ! Transform diagonal block - x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau) - hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) - - if (my_pe>0 .and. iblk==1) then - - ! The first column of the diagonal block has to be send to the previous PE - ! Calculate first column only ... - - ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*conjg(hv(1)) - hv(1:nc)*conjg(hd(1)) - - ! ... send it away ... -#ifdef WITH_OPENMP - call mpi_wait(ireq_ab,mpi_status,mpierr) -#else - call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) -#endif - ab_s(1:nb+1) = ab(1:nb+1,ns) - call mpi_isend(ab_s,nb+1,MPI_COMPLEX16,my_pe-1,1,mpi_comm,ireq_ab,mpierr) - - ! ... and calculate remaining columns with rank-2 update - if (nc>1) call ZHER2('L',nc-1,(-1.d0,0.d0),hd(2),1,hv(2),1,ab(1,ns+1),2*nb-1) - else - ! No need to send, just a rank-2 update - call ZHER2('L',nc,(-1.d0,0.d0),hd,1,hv,1,ab(1,ns),2*nb-1) - endif - - ! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... 
nb - - if (nr>0) then - if (nr>1) then - call ZGEMV('C',nr,nb-1,tau_new,ab(nb,ns+1),2*nb-1,hv_new,1,(0.d0,0.d0),h(2),1) - x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new - h(2:nb) = h(2:nb) - x*hv(2:nb) - ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update - do i=2,nb - ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*conjg(h(i)) - hs(1:nr)*conjg(hv(i)) - enddo - else - ! No double Householder transformation for nr=1, just complete the row - do i=2,nb - ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i)) - enddo - endif - endif - - ! Use new HH vector for the next block - hv(:) = hv_new(:) - tau = tau_new - - enddo -#ifdef WITH_OPENMP - endif -#endif - -#ifdef WITH_OPENMP - do iblk = 1, nblocks - - if (hh_dst(iblk) >= np_rows) exit - if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit - - if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then - ! Wait for last transfer to finish - call mpi_wait(ireq_hhs(iblk), mpi_status, mpierr) - ! Copy vectors into send buffer - hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) - ! Send to destination - call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_complex16, & - global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), & - 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) - ! Reset counter and increase destination row - hh_cnt(iblk) = 0 - hh_dst(iblk) = hh_dst(iblk)+1 - endif - enddo -#endif - enddo - - ! 
Finish the last outstanding requests -#ifdef WITH_OPENMP - call mpi_wait(ireq_ab,mpi_status,mpierr) - call mpi_wait(ireq_hv,mpi_status,mpierr) - - allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks))) - call mpi_waitall(nblocks, ireq_hhs, mpi_statuses, mpierr) - call mpi_waitall(num_chunks, ireq_hhr, mpi_statuses, mpierr) - deallocate(mpi_statuses) -#else - call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) - call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) - - call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr) - call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr) - -#endif - call mpi_barrier(mpi_comm,mpierr) - - deallocate(ab) - deallocate(ireq_hhr, ireq_hhs) - deallocate(hh_cnt, hh_dst) - deallocate(hh_gath, hh_send) - deallocate(limits, snd_limits) - deallocate(block_limits) - deallocate(global_id) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("tridiag_band_complex") -#endif - - end subroutine tridiag_band_complex - - subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, matrixCols, & - hh_trans_complex, mpi_comm_rows, mpi_comm_cols, & - wantDebug, success, THIS_COMPLEX_ELPA_KERNEL) - - !------------------------------------------------------------------------------- - ! trans_ev_tridi_to_band_complex: - ! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix - ! - ! Parameters - ! - ! na Order of matrix a, number of rows of matrix q - ! - ! nev Number eigenvectors to compute (= columns of matrix q) - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! nb semi bandwith - ! - ! q On input: Eigenvectors of tridiagonal matrix - ! On output: Transformed eigenvectors - ! Distribution is like in Scalapack. - ! - ! ldq Leading dimension of q - ! matrixCols local columns of matrix q - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns/both - ! 
- !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use pack_unpack_complex - use compute_hh_trafo_complex - use precision - implicit none - - integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL - integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck) :: q(ldq,*) -#else - complex(kind=ck) :: q(ldq,matrixCols) -#endif - complex(kind=ck) :: hh_trans_complex(:,:) - integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol - - integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2 - integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end - integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end - integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length - integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count -#ifdef WITH_OPENMP - integer(kind=ik) :: thread_width, csw, b_off, b_len -#endif - integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd - integer(kind=ik) :: a_off, current_tv_off, max_blk_size - integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk - logical :: flag - -#ifdef WITH_OPENMP - complex(kind=ck), allocatable :: a(:,:,:,:), row(:) -#else - complex(kind=ck), allocatable :: a(:,:,:), row(:) -#endif -#ifdef WITH_OPENMP - complex(kind=ck), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:) - complex(kind=ck), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:) -#else - complex(kind=ck), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:) - complex(kind=ck), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:) -#endif - complex(kind=ck), allocatable :: result_buffer(:,:,:) - complex(kind=ck), allocatable :: bcast_buffer(:,:) - - integer(kind=ik) :: n_off - integer(kind=ik), 
allocatable :: result_send_request(:), result_recv_request(:), limits(:) - integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:) - integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:) -#ifdef WITH_OPENMP - integer(kind=ik), allocatable :: mpi_statuses(:,:) - integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) -#endif - - ! MPI send/recv tags, arbitrary - - integer(kind=ik), parameter :: bottom_recv_tag = 111 - integer(kind=ik), parameter :: top_recv_tag = 222 - integer(kind=ik), parameter :: result_recv_tag = 333 - -#ifdef WITH_OPENMP - integer(kind=ik) :: max_threads, my_thread - integer(kind=ik) :: omp_get_max_threads -#endif - - ! Just for measuring the kernel performance - real(kind=rk) :: kernel_time - ! long integer - integer(kind=lik) :: kernel_flops - - logical, intent(in) :: wantDebug - logical :: success - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("trans_ev_tridi_to_band_complex") -#endif - - kernel_time = 1.d-100 - kernel_flops = 0 - -#ifdef WITH_OPENMP - max_threads = 1 - max_threads = omp_get_max_threads() -#endif - - call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr) - call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr) - call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr) - call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr) - - success = .true. - - if (mod(nbw,nblk)/=0) then - if (my_prow==0 .and. my_pcol==0) then - if (wantDebug) then - write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: ERROR: nbw=',nbw,', nblk=',nblk - write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: band backtransform works only for nbw==n*nblk' - endif - - success = .false. - return - endif - endif - - nfact = nbw / nblk - - - ! local number of eigenvectors - l_nev = local_index(nev, my_pcol, np_cols, nblk, -1) - - if (l_nev==0) then -#ifdef WITH_OPENMP - thread_width = 0 -#endif - stripe_width = 0 - stripe_count = 0 - last_stripe_width = 0 - else - ! 
Suggested stripe width is 48 - should this be reduced for the complex case ??? -#ifdef WITH_OPENMP - thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread -#endif - - stripe_width = 48 ! Must be a multiple of 4 -#ifdef WITH_OPENMP - stripe_count = (thread_width-1)/stripe_width + 1 -#else - stripe_count = (l_nev-1)/stripe_width + 1 -#endif - ! Adapt stripe width so that last one doesn't get too small -#ifdef WITH_OPENMP - stripe_width = (thread_width-1)/stripe_count + 1 -#else - stripe_width = (l_nev-1)/stripe_count + 1 -#endif - stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 !!! -#ifndef WITH_OPENMP - last_stripe_width = l_nev - (stripe_count-1)*stripe_width -#endif - endif - - ! Determine the matrix distribution at the beginning - - allocate(limits(0:np_rows)) - - call determine_workload(na, nbw, np_rows, limits) - - max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) - - a_dim2 = max_blk_size + nbw - -!DEC$ ATTRIBUTES ALIGN: 64:: a -#ifdef WITH_OPENMP - allocate(a(stripe_width,a_dim2,stripe_count,max_threads)) - ! a(:,:,:,:) should be set to 0 in a parallel region, not here! -#else - allocate(a(stripe_width,a_dim2,stripe_count)) - a(:,:,:) = 0 -#endif - - allocate(row(l_nev)) - row(:) = 0 - - ! Copy q from a block cyclic distribution into a distribution with contiguous rows, - ! and transpose the matrix using stripes of given stripe_width for cache blocking. - - ! The peculiar way it is done below is due to the fact that the last row should be - ! ready first since it is the first one to start below - -#ifdef WITH_OPENMP - ! Please note about the OMP usage below: - ! This is not for speed, but because we want the matrix a in the memory and - ! in the cache of the correct thread (if possible) -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - a(:,:,:,my_thread) = 0 ! 
if possible, do first touch allocation! - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#endif - do ip = np_rows-1, 0, -1 - if (my_prow == ip) then - ! Receive my rows which have not yet been received - src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) - do i=limits(ip)+1,limits(ip+1) - src = mod((i-1)/nblk, np_rows) - if (src < my_prow) then -#ifdef WITH_OPENMP - call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, mpi_status, mpierr) - -#else - call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) -#endif - -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call unpack_row_complex_cpu_openmp(a, row,i-limits(ip),my_thread, & - stripe_count, thread_width, stripe_width, l_nev) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call unpack_row_complex_cpu(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) -#endif - elseif (src==my_prow) then - src_offset = src_offset+1 - row(:) = q(src_offset, 1:l_nev) -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call unpack_row_complex_cpu_openmp(a, row,i-limits(ip),my_thread, & - stripe_count, thread_width, stripe_width, l_nev) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call unpack_row_complex_cpu(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) -#endif - endif - enddo - ! 
Send all rows which have not yet been send - src_offset = 0 - do dst = 0, ip-1 - do i=limits(dst)+1,limits(dst+1) - if(mod((i-1)/nblk, np_rows) == my_prow) then - src_offset = src_offset+1 - row(:) = q(src_offset, 1:l_nev) - call MPI_Send(row, l_nev, MPI_COMPLEX16, dst, 0, mpi_comm_rows, mpierr) - endif - enddo - enddo - else if(my_prow < ip) then - ! Send all rows going to PE ip - src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) - do i=limits(ip)+1,limits(ip+1) - src = mod((i-1)/nblk, np_rows) - if (src == my_prow) then - src_offset = src_offset+1 - row(:) = q(src_offset, 1:l_nev) - call MPI_Send(row, l_nev, MPI_COMPLEX16, ip, 0, mpi_comm_rows, mpierr) - endif - enddo - ! Receive all rows from PE ip - do i=limits(my_prow)+1,limits(my_prow+1) - src = mod((i-1)/nblk, np_rows) - if (src == ip) then -#ifdef WITH_OPENMP - call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, mpi_status, mpierr) -#else - call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) -#endif - -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call unpack_row_complex_cpu_openmp(a, row,i-limits(my_prow),my_thread, & - stripe_count, thread_width, stripe_width, l_nev) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call unpack_row_complex_cpu(a, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width) -#endif - endif - enddo - endif - enddo - - - ! 
Set up result buffer queue - - num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows - - num_result_buffers = 4*nfact - allocate(result_buffer(l_nev,nblk,num_result_buffers)) - - allocate(result_send_request(num_result_buffers)) - allocate(result_recv_request(num_result_buffers)) - result_send_request(:) = MPI_REQUEST_NULL - result_recv_request(:) = MPI_REQUEST_NULL - - ! Queue up buffers - - if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends - do j = 1, min(num_result_buffers, num_result_blocks) - call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_COMPLEX16, 0, result_recv_tag, & - mpi_comm_rows, result_recv_request(j), mpierr) - enddo - endif - - num_bufs_recvd = 0 ! No buffers received yet - - ! Initialize top/bottom requests - - allocate(top_send_request(stripe_count)) - allocate(top_recv_request(stripe_count)) - allocate(bottom_send_request(stripe_count)) - allocate(bottom_recv_request(stripe_count)) - - top_send_request(:) = MPI_REQUEST_NULL - top_recv_request(:) = MPI_REQUEST_NULL - bottom_send_request(:) = MPI_REQUEST_NULL - bottom_recv_request(:) = MPI_REQUEST_NULL - -#ifdef WITH_OPENMP - allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count)) - allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count)) - allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count)) - allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count)) - - top_border_send_buffer(:,:) = 0 - top_border_recv_buffer(:,:) = 0 - bottom_border_send_buffer(:,:) = 0 - bottom_border_recv_buffer(:,:) = 0 -#else - allocate(top_border_send_buffer(stripe_width, nbw, stripe_count)) - allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count)) - allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count)) - allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count)) - - top_border_send_buffer(:,:,:) = 0 - top_border_recv_buffer(:,:,:) = 0 - bottom_border_send_buffer(:,:,:) = 
0 - bottom_border_recv_buffer(:,:,:) = 0 -#endif - - ! Initialize broadcast buffer - - allocate(bcast_buffer(nbw, max_blk_size)) - bcast_buffer = 0 - - current_tv_off = 0 ! Offset of next row to be broadcast - - - ! ------------------- start of work loop ------------------- - - a_off = 0 ! offset in A (to avoid unnecessary shifts) - - top_msg_length = 0 - bottom_msg_length = 0 - - do sweep = 0, (na-1)/nbw - - current_n = na - sweep*nbw - call determine_workload(current_n, nbw, np_rows, limits) - current_n_start = limits(my_prow) - current_n_end = limits(my_prow+1) - current_local_n = current_n_end - current_n_start - - next_n = max(current_n - nbw, 0) - call determine_workload(next_n, nbw, np_rows, limits) - next_n_start = limits(my_prow) - next_n_end = limits(my_prow+1) - next_local_n = next_n_end - next_n_start - - if (next_n_end < next_n) then - bottom_msg_length = current_n_end - next_n_end - else - bottom_msg_length = 0 - endif - - if (next_local_n > 0) then - next_top_msg_length = current_n_start - next_n_start - else - next_top_msg_length = 0 - endif - - if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then - do i = 1, stripe_count -#ifdef WITH_OPENMP - csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! 
"current_stripe_width" - b_len = csw*nbw*max_threads - call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#else - call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#endif - enddo - endif - - if (current_local_n > 1) then - if (my_pcol == mod(sweep,np_cols)) then - bcast_buffer(:,1:current_local_n) = hh_trans_complex(:,current_tv_off+1:current_tv_off+current_local_n) - current_tv_off = current_tv_off + current_local_n - endif - call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_COMPLEX16, mod(sweep,np_cols), mpi_comm_cols, mpierr) - else - ! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_complex - bcast_buffer(:,1) = 0 - endif - - if (l_nev == 0) cycle - - if (current_local_n > 0) then - - do i = 1, stripe_count - -#ifdef WITH_OPENMP - ! Get real stripe width for strip i; - ! The last OpenMP tasks may have an even smaller stripe with, - ! but we don't care about this, i.e. we send/recv a bit too much in this case. - ! 
csw: current_stripe_width - - csw = min(stripe_width, thread_width-(i-1)*stripe_width) -#endif - - !wait_b - if (current_n_end < current_n) then -#ifdef WITH_OPENMP - call MPI_Wait(bottom_recv_request(i), mpi_status, mpierr) -#else - call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr) -#endif -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif -!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - n_off = current_local_n+a_off - b_len = csw*nbw - b_off = (my_thread-1)*b_len - a(1:csw,n_off+1:n_off+nbw,i,my_thread) = & - reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /)) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - n_off = current_local_n+a_off - a(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i) -#endif - if (next_n_end < next_n) then -#ifdef WITH_OPENMP - call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, & - MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#else - call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & - - mpi_comm_rows, bottom_recv_request(i), mpierr) -#endif - endif - endif - - if (current_local_n <= bottom_msg_length + top_msg_length) then - - !wait_t - if (top_msg_length>0) then -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(i), mpi_status, mpierr) -#else - call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) -#endif -#ifndef WITH_OPENMP - a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) -#endif - endif - - !compute -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - if (top_msg_length>0) then - b_len 
= csw*top_msg_length - b_off = (my_thread-1)*b_len - a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & - reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) - endif - call compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - 0, current_local_n, i, my_thread, & - THIS_COMPLEX_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - 0, current_local_n, i, last_stripe_width, & - THIS_COMPLEX_ELPA_KERNEL) -#endif - !send_b -#ifdef WITH_OPENMP - call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) -#else - call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - if (bottom_msg_length>0) then - n_off = current_local_n+nbw-bottom_msg_length+a_off -#ifdef WITH_OPENMP - b_len = csw*bottom_msg_length*max_threads - bottom_border_send_buffer(1:b_len,i) = & - reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) - call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#else - bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) - call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX16, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#endif - endif - - else - - !compute -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - call compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, & - a_off, nbw, max_blk_size, 
bcast_buffer, kernel_flops, kernel_time, & - current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, & - THIS_COMPLEX_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - current_local_n - bottom_msg_length, bottom_msg_length, i, & - last_stripe_width, THIS_COMPLEX_ELPA_KERNEL) - - -#endif - !send_b -#ifdef WITH_OPENMP - call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) -#else - - call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - if (bottom_msg_length > 0) then - n_off = current_local_n+nbw-bottom_msg_length+a_off -#ifdef WITH_OPENMP - b_len = csw*bottom_msg_length*max_threads - bottom_border_send_buffer(1:b_len,i) = & - reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) - call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#else - bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) - call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX16, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#endif - endif - - !compute -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, & - kernel_time, top_msg_length, & - current_local_n-top_msg_length-bottom_msg_length, i, & - my_thread, THIS_COMPLEX_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - 
-#else - call compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & - last_stripe_width, THIS_COMPLEX_ELPA_KERNEL) - -#endif - !wait_t - if (top_msg_length>0) then -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(i), mpi_status, mpierr) -#else - call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) -#endif -#ifndef WITH_OPENMP - a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) - -#endif - endif - - !compute -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - if (top_msg_length>0) then - b_len = csw*top_msg_length - b_off = (my_thread-1)*b_len - a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & - reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) - endif - call compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - 0, top_msg_length, i, my_thread, & - THIS_COMPLEX_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - 0, top_msg_length, i, last_stripe_width, & - THIS_COMPLEX_ELPA_KERNEL) -#endif - endif - - if (next_top_msg_length > 0) then - !request top_border data -#ifdef WITH_OPENMP - b_len = csw*next_top_msg_length*max_threads - call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_COMPLEX16, my_prow-1, & - top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) -#else - call MPI_Irecv(top_border_recv_buffer(1,1,i), 
next_top_msg_length*stripe_width, MPI_COMPLEX16, my_prow-1, & - top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) -#endif - endif - - !send_t - if (my_prow > 0) then -#ifdef WITH_OPENMP - call MPI_Wait(top_send_request(i), mpi_status, mpierr) -#else - call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - -#ifdef WITH_OPENMP - b_len = csw*nbw*max_threads - top_border_send_buffer(1:b_len,i) = reshape(a(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /)) - call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_COMPLEX16, & - my_prow-1, bottom_recv_tag, & - mpi_comm_rows, top_send_request(i), mpierr) -#else - top_border_send_buffer(:,1:nbw,i) = a(:,a_off+1:a_off+nbw,i) - call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow-1, bottom_recv_tag, & - mpi_comm_rows, top_send_request(i), mpierr) - -#endif - endif - - ! Care that there are not too many outstanding top_recv_request's - if (stripe_count > 1) then - if (i>1) then -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(i-1), mpi_status, mpierr) -#else - call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr) -#endif - else -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(stripe_count), mpi_status, mpierr) -#else - call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr) -#endif - endif - endif - - enddo - - top_msg_length = next_top_msg_length - - else - ! wait for last top_send_request - do i = 1, stripe_count -#ifdef WITH_OPENMP - call MPI_Wait(top_send_request(i), mpi_status, mpierr) -#else - call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - enddo - endif - - ! Care about the result - - if (my_prow == 0) then - - ! topmost process sends nbw rows to destination processes - - do j=0,nfact-1 - - num_blk = sweep*nfact+j ! global number of destination block, 0 based - if (num_blk*nblk >= na) exit - - nbuf = mod(num_blk, num_result_buffers) + 1 ! 
buffer number to get this block - -#ifdef WITH_OPENMP - call MPI_Wait(result_send_request(nbuf), mpi_status, mpierr) -#else - call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr) -#endif - - dst = mod(num_blk, np_rows) - - if (dst == 0) then - do i = 1, min(na - num_blk*nblk, nblk) -#ifdef WITH_OPENMP - call pack_row_complex_cpu_openmp(a, row, j*nblk+i+a_off, & - stripe_width, stripe_count, max_threads, thread_width, l_nev) -#else - call pack_row_complex_cpu(a, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) - -#endif - q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:) - enddo - else - do i = 1, nblk -#ifdef WITH_OPENMP - call pack_row_complex_cpu_openmp(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, & - stripe_width, stripe_count, max_threads, thread_width, l_nev) -#else - call pack_row_complex_cpu(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) - -#endif - enddo - call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX16, dst, & - result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr) - endif - enddo - - else - - ! receive and store final result - - do j = num_bufs_recvd, num_result_blocks-1 - - nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block - - ! If there is still work to do, just test for the next result request - ! and leave the loop if it is not ready, otherwise wait for all - ! outstanding requests - - if (next_local_n > 0) then -#ifdef WITH_OPENMP - call MPI_Test(result_recv_request(nbuf), flag, mpi_status, mpierr) - -#else - call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr) -#endif - if (.not.flag) exit - else -#ifdef WITH_OPENMP - call MPI_Wait(result_recv_request(nbuf), mpi_status, mpierr) - -#else - - call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr) -#endif - endif - - ! Fill result buffer into q - num_blk = j*np_rows + my_prow ! 
global number of current block, 0 based - do i = 1, min(na - num_blk*nblk, nblk) - q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf) - enddo - - ! Queue result buffer again if there are outstanding blocks left - if (j+num_result_buffers < num_result_blocks) & - call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX16, 0, result_recv_tag, & - mpi_comm_rows, result_recv_request(nbuf), mpierr) - - enddo - num_bufs_recvd = j - - endif - - ! Shift the remaining rows to the front of A (if necessary) - - offset = nbw - top_msg_length - - if (offset<0) then - if (wantDebug) then - write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: internal error, offset for shifting = ',offset - endif - success = .false. - return - endif - - a_off = a_off + offset - if (a_off + next_local_n + nbw > a_dim2) then -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, i, j), schedule(static, 1) - do my_thread = 1, max_threads - do i = 1, stripe_count - do j = top_msg_length+1, top_msg_length+next_local_n - A(:,j,i,my_thread) = A(:,j+a_off,i,my_thread) - enddo -#else - do i = 1, stripe_count - do j = top_msg_length+1, top_msg_length+next_local_n - A(:,j,i) = A(:,j+a_off,i) -#endif - enddo - enddo -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif -#endif - - a_off = 0 - endif - enddo - - ! 
Just for safety: - if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol - if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol - if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol - if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol - - if (my_prow == 0) then -#ifdef WITH_OPENMP - allocate(mpi_statuses(MPI_STATUS_SIZE,num_result_buffers)) - call MPI_Waitall(num_result_buffers, result_send_request, mpi_statuses, mpierr) - deallocate(mpi_statuses) -#else - call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr) -#endif - endif - - if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol - if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol - - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,'(" Kernel time:",f10.3," MFlops: ",f10.3)') kernel_time, kernel_flops/kernel_time*1.d-6 - - ! deallocate all working space - - deallocate(a) - deallocate(row) - deallocate(limits) - deallocate(result_send_request) - deallocate(result_recv_request) - deallocate(top_border_send_buffer) - deallocate(top_border_recv_buffer) - deallocate(bottom_border_send_buffer) - deallocate(bottom_border_recv_buffer) - deallocate(result_buffer) - deallocate(bcast_buffer) - deallocate(top_send_request) - deallocate(top_recv_request) - deallocate(bottom_send_request) - deallocate(bottom_recv_request) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_tridi_to_band_complex") -#endif - return -! contains -! -!#ifdef WITH_OPENMP -! subroutine compute_hh_trafo_complex(off, ncols, istripe, my_thread, THIS_COMPLEX_ELPA_KERNEL) -!#else -! 
subroutine compute_hh_trafo_complex(off, ncols, istripe, THIS_COMPLEX_ELPA_KERNEL) -!#endif -! use precision -!#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) -! use complex_generic_simple_kernel, only : single_hh_trafo_complex_generic_simple -!#endif -!#if defined(WITH_COMPLEX_GENERIC_KERNEL) -! use complex_generic_kernel, only : single_hh_trafo_complex_generic -!#endif -!#ifdef HAVE_DETAILED_TIMINGS -! use timings -!#endif -! implicit none -! integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL -! -! ! Private variables in OMP regions (my_thread) should better be in the argument list! -! -! integer(kind=ik) :: off, ncols, istripe, j, nl, jj -!#ifdef WITH_OPENMP -! integer(kind=ik) :: my_thread, noff -!#endif -! real(kind=rk) :: ttt -! -! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -! ! Currently (on Sandy Bridge), single is faster than double -! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -! -! complex(kind=ck) :: w(nbw,2) -! -!#ifdef HAVE_DETAILED_TIMINGS -! call timer%start("compute_hh_trafo_complex") -!#endif -! -!#ifdef WITH_OPENMP -! if (istripe<stripe_count) then -! nl = stripe_width -! else -! noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width -! nl = min(my_thread*thread_width-noff, l_nev-noff) -! if(nl<=0) return -! endif -!#else -! nl = merge(stripe_width, last_stripe_width, istripe<stripe_count) -!#endif -! -!#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) -!#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) -! if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) then -!#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ -! ttt = mpi_wtime() -! do j = ncols, 2, -2 -! w(:,1) = bcast_buffer(1:nbw,j+off) -! w(:,2) = bcast_buffer(1:nbw,j+off-1) -!#ifdef WITH_OPENMP -! call double_hh_trafo_complex_sse_avx_2hv(a(1,j+off+a_off-1,istripe,my_thread), & -! w, nbw, nl, stripe_width, nbw) -!#else -! call double_hh_trafo_complex_sse_avx_2hv(a(1,j+off+a_off-1,istripe), & -! 
w, nbw, nl, stripe_width, nbw) -!#endif -! enddo -!#ifdef WITH_OPENMP -! if (j==1) call single_hh_trafo_complex_sse_avx_1hv(a(1,1+off+a_off,istripe,my_thread), & -! bcast_buffer(1,off+1), nbw, nl, stripe_width) -!#else -! if (j==1) call single_hh_trafo_complex_sse_avx_1hv(a(1,1+off+a_off,istripe), & -! bcast_buffer(1,off+1), nbw, nl, stripe_width) -!#endif -!#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) -! endif -!#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ -!#endif /* WITH_COMPLEX_AVX_BLOCK2_KERNEL */ -! -! -!#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) -!#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) -! if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) then -!#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ -! ttt = mpi_wtime() -! do j = ncols, 1, -1 -!#ifdef WITH_OPENMP -! call single_hh_trafo_complex_generic_simple(a(1,j+off+a_off,istripe,my_thread), & -! bcast_buffer(1,j+off),nbw,nl,stripe_width) -!#else -! call single_hh_trafo_complex_generic_simple(a(1,j+off+a_off,istripe), & -! bcast_buffer(1,j+off),nbw,nl,stripe_width) -!#endif -! enddo -!#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) -! endif -!#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ -!#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */ -! -! -!#if defined(WITH_COMPLEX_GENERIC_KERNEL) -!#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) -! if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC .or. & -! THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGP .or. & -! THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGQ ) then -!#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ -! ttt = mpi_wtime() -! do j = ncols, 1, -1 -!#ifdef WITH_OPENMP -! call single_hh_trafo_complex_generic(a(1,j+off+a_off,istripe,my_thread), & -! bcast_buffer(1,j+off),nbw,nl,stripe_width) -!#else -! call single_hh_trafo_complex_generic(a(1,j+off+a_off,istripe), & -! bcast_buffer(1,j+off),nbw,nl,stripe_width) -!#endif -! enddo -!#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) -! 
endif -!#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ -!#endif /* WITH_COMPLEX_GENERIC_KERNEL */ -! -!#if defined(WITH_COMPLEX_SSE_KERNEL) -!#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) -! if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) then -!#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ -! ttt = mpi_wtime() -! do j = ncols, 1, -1 -!#ifdef WITH_OPENMP -! call single_hh_trafo_complex(a(1,j+off+a_off,istripe,my_thread), & -! bcast_buffer(1,j+off),nbw,nl,stripe_width) -!#else -! call single_hh_trafo_complex(a(1,j+off+a_off,istripe), & -! bcast_buffer(1,j+off),nbw,nl,stripe_width) -!#endif -! enddo -!#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) -! endif -!#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ -!#endif /* WITH_COMPLEX_SSE_KERNEL */ -! -! -!!#if defined(WITH_AVX_SANDYBRIDGE) -!! call single_hh_trafo_complex_sse_avx_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width) -!!#endif -! -!!#if defined(WITH_AMD_BULLDOZER) -!! call single_hh_trafo_complex_sse_avx_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width) -!!#endif -! -!#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) -!#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) -! if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) then -!#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ -! ttt = mpi_wtime() -! do j = ncols, 1, -1 -!#ifdef WITH_OPENMP -! call single_hh_trafo_complex_sse_avx_1hv(a(1,j+off+a_off,istripe,my_thread), & -! bcast_buffer(1,j+off),nbw,nl,stripe_width) -!#else -! call single_hh_trafo_complex_sse_avx_1hv(a(1,j+off+a_off,istripe), & -! bcast_buffer(1,j+off),nbw,nl,stripe_width) -!#endif -! enddo -!#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) -! endif -!#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ -!#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNE */ -! -!#ifdef WITH_OPENMP -! if (my_thread==1) then -!#endif -! kernel_flops = kernel_flops + 4*4*int(nl,8)*int(ncols,8)*int(nbw,8) -! kernel_time = kernel_time + mpi_wtime()-ttt -!#ifdef WITH_OPENMP -! 
endif -!#endif -!#ifdef HAVE_DETAILED_TIMINGS -! call timer%stop("compute_hh_trafo_complex") -!#endif -! -! -! end subroutine compute_hh_trafo_complex - - end subroutine trans_ev_tridi_to_band_complex - -#define DATATYPE REAL(kind=rk) -#define BYTESIZE 8 -#define REALCASE 1 -#include "redist_band.X90" -#undef DATATYPE -#undef BYTESIZE -#undef REALCASE - -#define DATATYPE COMPLEX(kind=ck) -#define BYTESIZE 16 -#define COMPLEXCASE 1 -#include "redist_band.X90" -#undef DATATYPE -#undef BYTESIZE -#undef COMPLEXCASE - - !--------------------------------------------------------------------------------------------------- - ! divide_band: sets the work distribution in band - ! Proc n works on blocks block_limits(n)+1 .. block_limits(n+1) - - subroutine divide_band(nblocks_total, n_pes, block_limits) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in) :: nblocks_total ! total number of blocks in band - integer(kind=ik), intent(in) :: n_pes ! number of PEs for division - integer(kind=ik), intent(out) :: block_limits(0:n_pes) - - integer(kind=ik) :: n, nblocks, nblocks_left - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("divide_band") -#endif - - block_limits(0) = 0 - if (nblocks_total < n_pes) then - ! Not enough work for all: The first tasks get exactly 1 block - do n=1,n_pes - block_limits(n) = min(nblocks_total,n) - enddo - else - ! Enough work for all. If there is no exact loadbalance, - ! the LAST tasks get more work since they are finishing earlier! 
- nblocks = nblocks_total/n_pes - nblocks_left = nblocks_total - n_pes*nblocks - do n=1,n_pes - if (n<=n_pes-nblocks_left) then - block_limits(n) = block_limits(n-1) + nblocks - else - block_limits(n) = block_limits(n-1) + nblocks + 1 - endif - enddo - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("divide_band") -#endif - - end subroutine - - subroutine band_band_real(na, nb, nb2, ab, ab2, d, e, mpi_comm) - - !------------------------------------------------------------------------------- - ! band_band_real: - ! Reduces a real symmetric banded matrix to a real symmetric matrix with smaller bandwidth. Householder transformations are not stored. - ! Matrix size na and original bandwidth nb have to be a multiple of the target bandwidth nb2. (Hint: expand your matrix with - ! zero entries, if this - ! requirement doesn't hold) - ! - ! na Order of matrix - ! - ! nb Semi bandwidth of original matrix - ! - ! nb2 Semi bandwidth of target matrix - ! - ! ab Input matrix with bandwidth nb. The leading dimension of the banded matrix has to be 2*nb. The parallel data layout - ! has to be accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb+1 to min(na, block_limits(n+1)*nb) - ! are located on rank n. - ! - ! ab2 Output matrix with bandwidth nb2. The leading dimension of the banded matrix is 2*nb2. The parallel data layout is - ! accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb2+1 to min(na, block_limits(n+1)*nb2) are located - ! on rank n. - ! - ! d(na) Diagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output) - ! - ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output) - ! - ! mpi_comm - ! 
MPI-Communicator for the total processor set - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik), intent(in) :: na, nb, nb2, mpi_comm - real(kind=rk), intent(inout) :: ab(2*nb,*) ! remove assumed size - real(kind=rk), intent(inout) :: ab2(2*nb2,*) ! remove assumed size - real(kind=rk), intent(out) :: d(na), e(na) ! set only on PE 0 - - real(kind=rk) :: hv(nb,nb2), w(nb,nb2), w_new(nb,nb2), tau(nb2), hv_new(nb,nb2), & - tau_new(nb2), ab_s(1+nb,nb2), ab_r(1+nb,nb2), ab_s2(2*nb2,nb2), hv_s(nb,nb2) - - real(kind=rk) :: work(nb*nb2), work2(nb2*nb2) - integer(kind=ik) :: lwork, info - - integer(kind=ik) :: istep, i, n, dest - integer(kind=ik) :: n_off, na_s - integer(kind=ik) :: my_pe, n_pes, mpierr - integer(kind=ik) :: nblocks_total, nblocks - integer(kind=ik) :: nblocks_total2, nblocks2 - integer(kind=ik) :: ireq_ab, ireq_hv - integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) - integer(kind=ik), allocatable :: mpi_statuses(:,:) - integer(kind=ik), allocatable :: block_limits(:), block_limits2(:), ireq_ab2(:) - - integer(kind=ik) :: j, nc, nr, ns, ne, iblk - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("band_band_real") -#endif - call mpi_comm_rank(mpi_comm,my_pe,mpierr) - call mpi_comm_size(mpi_comm,n_pes,mpierr) - - ! Total number of blocks in the band: - nblocks_total = (na-1)/nb + 1 - nblocks_total2 = (na-1)/nb2 + 1 - - ! Set work distribution - allocate(block_limits(0:n_pes)) - call divide_band(nblocks_total, n_pes, block_limits) - - allocate(block_limits2(0:n_pes)) - call divide_band(nblocks_total2, n_pes, block_limits2) - - ! 
nblocks: the number of blocks for my task - nblocks = block_limits(my_pe+1) - block_limits(my_pe) - nblocks2 = block_limits2(my_pe+1) - block_limits2(my_pe) - - allocate(ireq_ab2(1:nblocks2)) - ireq_ab2 = MPI_REQUEST_NULL - if (nb2>1) then - do i=0,nblocks2-1 - call mpi_irecv(ab2(1,i*nb2+1),2*nb2*nb2,mpi_real8,0,3,mpi_comm,ireq_ab2(i+1),mpierr) - enddo - endif - - ! n_off: Offset of ab within band - n_off = block_limits(my_pe)*nb - lwork = nb*nb2 - dest = 0 - - ireq_ab = MPI_REQUEST_NULL - ireq_hv = MPI_REQUEST_NULL - - ! --------------------------------------------------------------------------- - ! Start of calculations - - na_s = block_limits(my_pe)*nb + 1 - - if (my_pe>0 .and. na_s<=na) then - ! send first nb2 columns to previous PE - ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also) - do i=1,nb2 - ab_s(1:nb+1,i) = ab(1:nb+1,na_s-n_off+i-1) - enddo - call mpi_isend(ab_s,(nb+1)*nb2,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) - endif - - do istep=1,na/nb2 - - if (my_pe==0) then - - n = MIN(na-na_s-nb2+1,nb) ! number of rows to be reduced - hv(:,:) = 0 - tau(:) = 0 - - ! The last step (istep=na-1) is only needed for sending the last HH vectors. - ! We don't want the sign of the last element flipped (analogous to the other sweeps) - if (istep < na/nb2) then - - ! Transform first block column of remaining matrix - call dgeqrf(n, nb2, ab(1+nb2,na_s-n_off), 2*nb-1, tau, work, lwork, info); - - do i=1,nb2 - hv(i,i) = 1.0 - hv(i+1:n,i) = ab(1+nb2+1:1+nb2+n-i,na_s-n_off+i-1) - ab(1+nb2+1:2*nb,na_s-n_off+i-1) = 0 - enddo - - endif - - if (nb2==1) then - d(istep) = ab(1,na_s-n_off) - e(istep) = ab(2,na_s-n_off) - if (istep == na) then - e(na) = 0 - endif - else - ab_s2 = 0 - ab_s2(:,:) = ab(1:nb2+1,na_s-n_off:na_s-n_off+nb2-1) - if (block_limits2(dest+1)<istep) then - dest = dest+1 - endif - call mpi_send(ab_s2,2*nb2*nb2,mpi_real8,dest,3,mpi_comm,mpierr) - endif - - else - if (na>na_s+nb2-1) then - ! 
Receive Householder vectors from previous task, from PE owning subdiagonal - call mpi_recv(hv,nb*nb2,mpi_real8,my_pe-1,2,mpi_comm,mpi_status,mpierr) - do i=1,nb2 - tau(i) = hv(i,i) - hv(i,i) = 1. - enddo - endif - endif - - na_s = na_s+nb2 - if (na_s-n_off > nb) then - ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb) - ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0 - n_off = n_off + nb - endif - - do iblk=1,nblocks - ns = na_s + (iblk-1)*nb - n_off ! first column in block - ne = ns+nb-nb2 ! last column in block - - if (ns+n_off>na) exit - - nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block - nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) - ! Note that nr>=0 implies that diagonal block is full (nc==nb)! - - call wy_gen(nc,nb2,w,hv,tau,work,nb) - - if (iblk==nblocks .and. nc==nb) then - !request last nb2 columns - call mpi_recv(ab_r,(nb+1)*nb2,mpi_real8,my_pe+1,1,mpi_comm,mpi_status,mpierr) - do i=1,nb2 - ab(1:nb+1,ne+i-1) = ab_r(:,i) - enddo - endif - - hv_new(:,:) = 0 ! Needed, last rows must be 0 for nr < nb - tau_new(:) = 0 - - if (nr>0) then - call wy_right(nr,nb,nb2,ab(nb+1,ns),2*nb-1,w,hv,work,nb) - - call dgeqrf(nr,nb2,ab(nb+1,ns),2*nb-1,tau_new,work,lwork,info); - - do i=1,nb2 - hv_new(i,i) = 1.0 - hv_new(i+1:,i) = ab(nb+2:2*nb-i+1,ns+i-1) - ab(nb+2:,ns+i-1) = 0 - enddo - - !send hh-vector - if (iblk==nblocks) then - call mpi_wait(ireq_hv,mpi_status,mpierr) - hv_s = hv_new - do i=1,nb2 - hv_s(i,i) = tau_new(i) - enddo - call mpi_isend(hv_s,nb*nb2,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr) - endif - - endif - - call wy_symm(nc,nb2,ab(1,ns),2*nb-1,w,hv,work,work2,nb) - - if (my_pe>0 .and. 
iblk==1) then - !send first nb2 columns to previous PE - call mpi_wait(ireq_ab,mpi_status,mpierr) - do i=1,nb2 - ab_s(1:nb+1,i) = ab(1:nb+1,ns+i-1) - enddo - call mpi_isend(ab_s,(nb+1)*nb2,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) - endif - - if (nr>0) then - call wy_gen(nr,nb2,w_new,hv_new,tau_new,work,nb) - call wy_left(nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb) - endif - - ! Use new HH vector for the next block - hv(:,:) = hv_new(:,:) - tau = tau_new - enddo - enddo - - ! Finish the last outstanding requests - call mpi_wait(ireq_ab,mpi_status,mpierr) - call mpi_wait(ireq_hv,mpi_status,mpierr) - allocate(mpi_statuses(MPI_STATUS_SIZE,nblocks2)) - call mpi_waitall(nblocks2,ireq_ab2,mpi_statuses,mpierr) - deallocate(mpi_statuses) - - call mpi_barrier(mpi_comm,mpierr) - - deallocate(block_limits) - deallocate(block_limits2) - deallocate(ireq_ab2) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("band_band_real") -#endif - - end subroutine - - subroutine wy_gen(n, nb, W, Y, tau, mem, lda) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in) :: n !length of householder-vectors - integer(kind=ik), intent(in) :: nb !number of householder-vectors - integer(kind=ik), intent(in) :: lda !leading dimension of Y and W - real(kind=rk), intent(in) :: Y(lda,nb) !matrix containing nb householder-vectors of length b - real(kind=rk), intent(in) :: tau(nb) !tau values - real(kind=rk), intent(out) :: W(lda,nb) !output matrix W - real(kind=rk), intent(in) :: mem(nb) !memory for a temporary matrix of size nb - - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("wy_gen") -#endif - - W(1:n,1) = tau(1)*Y(1:n,1) - do i=2,nb - W(1:n,i) = tau(i)*Y(1:n,i) - call DGEMV('T',n,i-1,1.d0,Y,lda,W(1,i),1,0.d0,mem,1) - call DGEMV('N',n,i-1,-1.d0,W,lda,mem,1,1.d0,W(1,i),1) - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("wy_gen") -#endif - - end subroutine - - subroutine wy_left(n, m, 
nb, A, lda, W, Y, mem, lda2) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in) :: n !width of the matrix A - integer(kind=ik), intent(in) :: m !length of matrix W and Y - integer(kind=ik), intent(in) :: nb !width of matrix W and Y - integer(kind=ik), intent(in) :: lda !leading dimension of A - integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y - real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size - real(kind=rk), intent(in) :: W(m,nb) !blocked transformation matrix W - real(kind=rk), intent(in) :: Y(m,nb) !blocked transformation matrix Y - real(kind=rk), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("wy_left") -#endif - - call DGEMM('T', 'N', nb, n, m, 1.d0, W, lda2, A, lda, 0.d0, mem, nb) - call DGEMM('N', 'N', m, n, nb, -1.d0, Y, lda2, mem, nb, 1.d0, A, lda) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("wy_left") -#endif - - end subroutine - - subroutine wy_right(n, m, nb, A, lda, W, Y, mem, lda2) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in) :: n !height of the matrix A - integer(kind=ik), intent(in) :: m !length of matrix W and Y - integer(kind=ik), intent(in) :: nb !width of matrix W and Y - integer(kind=ik), intent(in) :: lda !leading dimension of A - integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y - real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! 
remove assumed size - real(kind=rk), intent(in) :: W(m,nb) !blocked transformation matrix W - real(kind=rk), intent(in) :: Y(m,nb) !blocked transformation matrix Y - real(kind=rk), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("wy_right") -#endif - - call DGEMM('N', 'N', n, nb, m, 1.d0, A, lda, W, lda2, 0.d0, mem, n) - call DGEMM('N', 'T', n, m, nb, -1.d0, mem, n, Y, lda2, 1.d0, A, lda) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("wy_right") -#endif - - end subroutine - - subroutine wy_symm(n, nb, A, lda, W, Y, mem, mem2, lda2) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in) :: n !width/heigth of the matrix A; length of matrix W and Y - integer(kind=ik), intent(in) :: nb !width of matrix W and Y - integer(kind=ik), intent(in) :: lda !leading dimension of A - integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y - real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! 
remove assumed size - real(kind=rk), intent(in) :: W(n,nb) !blocked transformation matrix W - real(kind=rk), intent(in) :: Y(n,nb) !blocked transformation matrix Y - real(kind=rk) :: mem(n,nb) !memory for a temporary matrix of size n x nb - real(kind=rk) :: mem2(nb,nb) !memory for a temporary matrix of size nb x nb - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("wy_symm") -#endif - - call DSYMM('L', 'L', n, nb, 1.d0, A, lda, W, lda2, 0.d0, mem, n) - call DGEMM('T', 'N', nb, nb, n, 1.d0, mem, n, W, lda2, 0.d0, mem2, nb) - call DGEMM('N', 'N', n, nb, nb, -0.5d0, Y, lda2, mem2, nb, 1.d0, mem, n) - call DSYR2K('L', 'N', n, nb, -1.d0, Y, lda2, mem, n, 1.d0, A, lda) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("wy_symm") -#endif - end subroutine -end module ELPA2_compute diff --git a/src/elpa2_kernels/README_elpa2_kernels.txt b/src/elpa2_kernels/README_elpa2_kernels.txt deleted file mode 100644 index 263607882..000000000 --- a/src/elpa2_kernels/README_elpa2_kernels.txt +++ /dev/null @@ -1,156 +0,0 @@ -This file is intended as guideline for choosing one appropiate -ELPA2-kernel for your installation. - -ELPA generally uses BLAS-Routines for all compute intensive work -so that the performance of ELPA mainly depends on the quality of -the BLAS implementation used when linking. - -The only exception is the backtransformation of the eigenvectors -for the 2-stage solver (ELPA2). In this case BLAS routines cannot -be used effectively due to the nature of the problem. - -The compute intensive part of the backtransformation of ELPA2 -has been put to a file of its own (elpa2_kernels.f90) so that -this can be replaced by hand tailored, optimized code for -specific platforms. - -However, we cannot choose for you the best kernels, you should read -these hints, and maybe try which kernel works best for you. 
- -Currently we offer the following alternatives for the ELPA2 kernels: - -* elpa2_kernels_{real|complex}.f90 - - - The generic FORTRAN version of the ELPA2 kernels - which should be useable on every platform. - It contains some hand optimizations (loop unrolling) - in the hope to get optimal code from most FORTRAN - compilers. The configure option "--with-generic" - uses these kernels. They are propably a good - default if you do not know which kernel - to use. Note that in the real version, - there is used a complex variable in - order to enforce better compiler - optimizations. This produces correct - code, however, some compilers might - produce a warning. - - - -* elpa2_kernels_{real|complex}_simple.f90 - - - Plain and simple version of elpa2_kernels.f90. - Please note that we observed that some compilers get - get confused by the hand optimizations done in - elpa2_kernels_{real|complex}.f90 and - give better performance with this - version - so it is worth to try both! - The configure option "--with-generic-simple" - uses these kernels. - -* elpa2_kernels_real_bgp.f90 - - Fortran code enhanced with assembler calls - for the IBM BlueGene/P. For the complex - eigenvalue problem the "elpa2_kernels_complex.f90" - is recommended. The configure option - "--with-generic-bgp" uses these - kernels. Note that the OpenMP functionality of - this kernel is not yet tested and thus an - preprocessor error is thrown in the combination - of this kernel with OpenMP. By manually editing - the file src/elpa2.F90 one can avoid this and - test the OpenMP functionality. The ELPA - developers would welcome every feedback to this - subject. - -* elpa2_kernels_real_bgq.f90 - - Fortran code enhanced with assembler calls - for the IBM BlueGene/Q. For the complex - eigenvalue problem the "elpa2_kernels_complex.f90" - is recommended. The configure option - "--with-generic-bgq" uses these - kernels. 
Note that the OpenMP functionality of - this kernel is not yet tested and thus an - preprocessor error is thrown in the combination - of this kernel with OpenMP. By manually editing - the file src/elpa2.F90 one can avoid this and - test the OpenMP functionality. The ELPA - developers would welcome every feedback - to this subject. - -* elpa2_kernels_asm_x86_64.s - - Fortran code enhanced with assembler - for the SSE vectorization. The configure option - "--with-sse-assembler" uses these kernels. - They are worth trying on x86_64 without AVX, - e.g. Intel Nehalem. - - - -Several - -* elpa2_kernels_{real|complex}_sse-avx_*.c(pp) - - Optimized intrinisic code for x86_64 - systems (i.e. Intel/AMD architecture) - using SSE2/SSE3 operations. - (Use gcc for compiling as Intel - compiler generates slower code!) - - Note that you have to specify with - configure the flags - CFLAGS="-O3 -mavx -funsafe-loop-optimizations \ - -funsafe-math-optimizations -ftree-vect-loop-version \ - -ftree-vectorize" - and - CXXFLAGS="-O3 -mavx -funsafe-loop-optimizations \ - -funsafe-math-optimizations -ftree-vect-loop-version \ - -ftree-vectorize" - for best performace results. - - For convenience the flag - "--with-avx-optimization" sets these - CFLAGS and CXXFLAGS automatically. - - On Intel Sandybridge architectures the - configure option "--with-avx-sandybride" - uses the best combination, which is a - combination of block2 for real matrices - and block1 for complex matrices. - - On AMD Bulldozer architectures the - configure option "--with-amd-bulldozer" - uses the best combination, which is a - combination of block4 for real matrices - and block1 for complex matrices. - - Otherwise, you can try out your own - combinations with the configure options - "--with-avx-complex-block{1|2}" and - "--with-avx-real-block{2|4|6}". - - - - -So which version should be used? 
-================================ - -* On the IBM BlueGene/P, BlueGene/Q, - you should get the optimal performance using the optimized intrinsics/assembler versions - elpa2_kernels_{real|complex}_bg{p|q}.f90, respectively. - - -* On x86_64 systems (i.e. almost all Intel/AMD systems) you should get - the optimal performance using the optimized intrinsics/assembler versions - in elpa2_kernels_*.c or elpa2_kernels_{real|complex}_bg{p|q}.f90 - respectively. However, here you have quite some choice to find your - optimal kernel. - -* If you don't compile for one of these systems or you don't like to use assembler - for any reason, it is likely that you are best off using elpa2_kernels.f90. - Make a perfomance test with elpa2_kernels_simple.f90, however, to check if - your compiler doesn't get confused by the hand optimizations. - -* If you want to develop your own optimized kernels for you platform, it is - easier to start with elpa2_kernels_simple.f90. - Don't let you confuse from the huge code in elpa2_kernels.f90, the mathemathics - done in the kernels is relatively trivial. diff --git a/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s b/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s deleted file mode 100644 index 1ebcf625e..000000000 --- a/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s +++ /dev/null @@ -1,701 +0,0 @@ -# -------------------------------------------------------------------------------------------------- -# -# This file contains the compute intensive kernels for the Householder transformations, -# coded in x86_64 assembler and using SSE2/SSE3 instructions. -# -# It must be assembled with GNU assembler (just "as" on most Linux machines) -# -# Copyright of the original code rests with the authors inside the ELPA -# consortium. The copyright of any additional modifications shall rest -# with their original authors, but shall adhere to the licensing terms -# distributed along with the original code in the file "COPYING". 
-# -# -------------------------------------------------------------------------------------------------- - .globl double_hh_trafo_ - .globl single_hh_trafo_complex_ - .text - -#------------------------------------------------------------------------------- -#------------------------------------------------------------------------------- - - .macro hh_trafo_real nrows - - # When this macro is called, the following registers are set and must not be changed - # %rdi: Address of q - # %rsi: Address of hh - # %rdx: nb - # %rcx: Remaining rows nq - # %r8: ldq in bytes - # %r9: ldh in bytes - # %rax: address of hh at the end of the loops - # The top of the stack must contain the dot product of the two Householder vectors - - movq %rdi, %r10 # Copy address of q - movq %rsi, %r11 # Copy address of hh - - -# x1 = q(1,2) -# x2 = q(2,2) -# -# y1 = q(1,1) + q(1,2)*hh(2,2) -# y2 = q(2,1) + q(2,2)*hh(2,2) - - movaps (%r10), %xmm6 # y1 = q(1,1) - movaps 16(%r10), %xmm7 # y2 = q(2,1) - .if \nrows>=8 - movaps 32(%r10), %xmm8 - movaps 48(%r10), %xmm9 - .if \nrows==12 - movaps 64(%r10), %xmm10 - movaps 80(%r10), %xmm11 - .endif - .endif - - addq %r8, %r10 # %r10 => q(.,2) - movddup 8(%r11,%r9), %xmm15 # hh(2,2) - - .macro mac_pre_loop1 qoff, X, Y - movaps \qoff(%r10), \X # xn = q(n,2) - movaps \X, %xmm12 - mulpd %xmm15, %xmm12 - addpd %xmm12, \Y # yn = yn + xn*h(2,2) - .endm - - mac_pre_loop1 0, %xmm0, %xmm6 - mac_pre_loop1 16, %xmm1, %xmm7 - .if \nrows>=8 - mac_pre_loop1 32, %xmm2, %xmm8 - mac_pre_loop1 48, %xmm3, %xmm9 - .if \nrows==12 - mac_pre_loop1 64, %xmm4, %xmm10 - mac_pre_loop1 80, %xmm5, %xmm11 - .endif - .endif - .purgem mac_pre_loop1 - -# do i=3,nb -# h1 = hh(i-1,1) -# h2 = hh(i,2) -# x1 = x1 + q(1,i)*h1 -# y1 = y1 + q(1,i)*h2 -# x2 = x2 + q(2,i)*h1 -# y2 = y2 + q(2,i)*h2 -# ... 
-# enddo - - addq $8, %r11 - .align 16 -1: - cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax - jge 2f - - addq %r8, %r10 # %r10 => q(.,i) - - movddup (%r11), %xmm14 # hh(i-1,1) - movddup 8(%r11,%r9), %xmm15 # hh(i,2) - - .macro mac_loop1 qoff, X, Y - movaps \qoff(%r10), %xmm13 # q(.,i) - movaps %xmm13, %xmm12 - mulpd %xmm14, %xmm13 - addpd %xmm13, \X # xn = xn + q(.,i)*h1 - mulpd %xmm15, %xmm12 - addpd %xmm12, \Y # yn = yn + q(.,i)*h2 - .endm - - mac_loop1 0, %xmm0, %xmm6 - mac_loop1 16, %xmm1, %xmm7 - .if \nrows>=8 - mac_loop1 32, %xmm2, %xmm8 - mac_loop1 48, %xmm3, %xmm9 - .if \nrows==12 - mac_loop1 64, %xmm4, %xmm10 - mac_loop1 80, %xmm5, %xmm11 - .endif - .endif - .purgem mac_loop1 - - addq $8, %r11 - jmp 1b -2: - -# x1 = x1 + q(1,nb+1)*hh(nb,1) -# x2 = x2 + q(2,nb+1)*hh(nb,1) - - addq %r8, %r10 # %r10 => q(.,nb+1) - movddup (%r11), %xmm14 - - .macro mac_post_loop1 qoff, X - movaps \qoff(%r10), %xmm13 # q(.,nb+1) - mulpd %xmm14, %xmm13 - addpd %xmm13, \X - .endm - - mac_post_loop1 0, %xmm0 - mac_post_loop1 16, %xmm1 - .if \nrows>=8 - mac_post_loop1 32, %xmm2 - mac_post_loop1 48, %xmm3 - .if \nrows==12 - mac_post_loop1 64, %xmm4 - mac_post_loop1 80, %xmm5 - .endif - .endif - .purgem mac_post_loop1 - -# tau1 = hh(1,1) -# tau2 = hh(1,2) -# -# h1 = -tau1 -# x1 = x1*h1 -# x2 = x2*h1 - - movq %rsi, %r11 # restore %r11 (hh(1,1)) - - movddup (%r11), %xmm12 # hh(1,1) - xorps %xmm14, %xmm14 - subpd %xmm12, %xmm14 # %xmm14 = -hh(1,1) - - mulpd %xmm14, %xmm0 - mulpd %xmm14, %xmm1 - .if \nrows>=8 - mulpd %xmm14, %xmm2 - mulpd %xmm14, %xmm3 - .if \nrows==12 - mulpd %xmm14, %xmm4 - mulpd %xmm14, %xmm5 - .endif - .endif - -# h1 = -tau2 -# h2 = -tau2*s -# y1 = y1*h1 + x1*h2 -# y2 = y2*h1 + x2*h2 - - movddup (%r11,%r9), %xmm12 # hh(1,2) - xorps %xmm15, %xmm15 - subpd %xmm12, %xmm15 # %xmm15 = -hh(1,2) = h1 - movaps %xmm15, %xmm14 - movddup (%rsp), %xmm12 # Get s from top of stack - mulpd %xmm12, %xmm14 # %xmm14 = h2 - - .macro mac_xform_y X, Y - mulpd %xmm15, \Y # y1 = 
y1*h1 - movaps \X, %xmm12 - mulpd %xmm14, %xmm12 - addpd %xmm12, \Y - .endm - - mac_xform_y %xmm0, %xmm6 - mac_xform_y %xmm1, %xmm7 - .if \nrows>=8 - mac_xform_y %xmm2, %xmm8 - mac_xform_y %xmm3, %xmm9 - .if \nrows==12 - mac_xform_y %xmm4, %xmm10 - mac_xform_y %xmm5, %xmm11 - .endif - .endif - .purgem mac_xform_y - -# q(1,1) = q(1,1) + y1 -# q(2,1) = q(2,1) + y2 - - movq %rdi, %r10 # restore original Q - - .macro mac_pre_loop2_1 qoff, Y - movaps \qoff(%r10), %xmm13 # q(.,1) - addpd \Y, %xmm13 - movaps %xmm13, \qoff(%r10) - .endm - - mac_pre_loop2_1 0, %xmm6 - mac_pre_loop2_1 16, %xmm7 - .if \nrows>=8 - mac_pre_loop2_1 32, %xmm8 - mac_pre_loop2_1 48, %xmm9 - .if \nrows==12 - mac_pre_loop2_1 64, %xmm10 - mac_pre_loop2_1 80, %xmm11 - .endif - .endif - .purgem mac_pre_loop2_1 - -# q(1,2) = q(1,2) + x1 + y1*hh(2,2) -# q(2,2) = q(2,2) + x2 + y2*hh(2,2) - - addq %r8, %r10 # %r10 => q(.,2) - - movddup 8(%r11,%r9), %xmm15 # hh(2,2) - - .macro mac_pre_loop2_2 qoff, X, Y - movaps \X, %xmm13 - movaps \Y, %xmm12 - mulpd %xmm15, %xmm12 - addpd %xmm12, %xmm13 - addpd \qoff(%r10), %xmm13 - movaps %xmm13, \qoff(%r10) - .endm - - mac_pre_loop2_2 0, %xmm0, %xmm6 - mac_pre_loop2_2 16, %xmm1, %xmm7 - .if \nrows>=8 - mac_pre_loop2_2 32, %xmm2, %xmm8 - mac_pre_loop2_2 48, %xmm3, %xmm9 - .if \nrows==12 - mac_pre_loop2_2 64, %xmm4, %xmm10 - mac_pre_loop2_2 80, %xmm5, %xmm11 - .endif - .endif - .purgem mac_pre_loop2_2 - -# do i=3,nb -# h1 = hh(i-1,1) -# h2 = hh(i,2) -# q(1,i) = q(1,i) + x1*h1 + y1*h2 -# q(2,i) = q(2,i) + x2*h1 + y2*h2 -# enddo - - addq $8, %r11 - .align 16 -1: - cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax - jge 2f - - addq %r8, %r10 # %r10 => q(.,i) - - movddup (%r11), %xmm14 # hh(i-1,1) - movddup 8(%r11,%r9), %xmm15 # hh(i,2) - - .macro mac_loop2 qoff, X, Y - movaps \X, %xmm13 - mulpd %xmm14, %xmm13 - movaps \Y, %xmm12 - mulpd %xmm15, %xmm12 - addpd %xmm12, %xmm13 - addpd \qoff(%r10), %xmm13 - movaps %xmm13, \qoff(%r10) - .endm - - mac_loop2 0, %xmm0, %xmm6 - 
mac_loop2 16, %xmm1, %xmm7 - .if \nrows>=8 - mac_loop2 32, %xmm2, %xmm8 - mac_loop2 48, %xmm3, %xmm9 - .if \nrows==12 - mac_loop2 64, %xmm4, %xmm10 - mac_loop2 80, %xmm5, %xmm11 - .endif - .endif - .purgem mac_loop2 - - addq $8, %r11 - jmp 1b -2: - -# q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) -# q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) - - addq %r8, %r10 # %r10 => q(.,nb+1) - movddup (%r11), %xmm14 - - .macro mac_post_loop2 qoff, X - movaps \qoff(%r10), %xmm13 # q(.,nb+1) - mulpd %xmm14, \X - addpd \X, %xmm13 - movaps %xmm13, \qoff(%r10) - .endm - - mac_post_loop2 0, %xmm0 - mac_post_loop2 16, %xmm1 - .if \nrows>=8 - mac_post_loop2 32, %xmm2 - mac_post_loop2 48, %xmm3 - .if \nrows==12 - mac_post_loop2 64, %xmm4 - mac_post_loop2 80, %xmm5 - .endif - .endif - .purgem mac_post_loop2 - - .endm - -#------------------------------------------------------------------------------- -#------------------------------------------------------------------------------- -# FORTRAN Interface: -# -# subroutine double_hh_trafo(q, hh, nb, nq, ldq, ldh) -# -# integer, intent(in) :: nb, nq, ldq, ldh -# real*8, intent(inout) :: q(ldq,*) -# real*8, intent(in) :: hh(ldh,*) -# -# Parameter mapping to registers -# parameter 1: %rdi : q -# parameter 2: %rsi : hh -# parameter 3: %rdx : nb -# parameter 4: %rcx : nq -# parameter 5: %r8 : ldq -# parameter 6: %r9 : ldh -# -#------------------------------------------------------------------------------- - .align 16,0x90 -double_hh_trafo_: - - # Get integer parameters into corresponding registers - - movslq (%rdx), %rdx # nb - movslq (%rcx), %rcx # nq - movslq (%r8), %r8 # ldq - movslq (%r9), %r9 # ldh - - # Get ldq in bytes - addq %r8, %r8 - addq %r8, %r8 - addq %r8, %r8 # 8*ldq, i.e. ldq in bytes - - # Get ldh in bytes - addq %r9, %r9 - addq %r9, %r9 - addq %r9, %r9 # 8*ldh, i.e. ldh in bytes - - # set %rax to the address of hh at the end of the loops, - # i.e. if %rdx >= %rax we must jump out of the loop. 
- # please note: %rax = 8*%rdx + %rsi - 8 - movq %rdx, %rax - addq %rax, %rax - addq %rax, %rax - addq %rax, %rax - addq %rsi, %rax - subq $8, %rax - -#----------------------------------------------------------- - # Calculate the dot product of the two Householder vectors - - # decrement stack pointer to make space for s - subq $8, %rsp - -# Fortran code: -# s = hh(2,2)*1 -# do i=3,nb -# s = s+hh(i,2)*hh(i-1,1) -# enddo - - movq %rsi, %r11 # Copy address of hh - - movsd 8(%r11,%r9), %xmm0 # hh(2,2) - addq $8, %r11 -1: - cmpq %rax, %r11 - jge 2f - movsd (%r11), %xmm14 # hh(i-1,1) - movsd 8(%r11,%r9), %xmm15 # hh(i,2) - mulsd %xmm14, %xmm15 - addsd %xmm15, %xmm0 - addq $8, %r11 - jmp 1b -2: - movsd %xmm0, (%rsp) # put s on top of stack -#----------------------------------------------------------- - -rloop_s: - cmpq $8, %rcx # if %rcx <= 8 jump out of loop - jle rloop_e - hh_trafo_real 12 # transform 12 rows - addq $96, %rdi # increment q start adress by 96 bytes (6 rows) - subq $12, %rcx # decrement nq - jmp rloop_s -rloop_e: - - cmpq $4, %rcx # if %rcx <= 4 jump to test_2 - jle test_4 - hh_trafo_real 8 # transform 8 rows - jmp return1 - -test_4: - cmpq $0, %rcx # if %rcx <= 0 jump to return - jle return1 - hh_trafo_real 4 # transform 4 rows - -return1: - addq $8, %rsp # reset stack pointer - ret - - .align 16,0x90 - -#------------------------------------------------------------------------------- -#------------------------------------------------------------------------------- - - .macro hh_trafo_complex nrows - - # When this macro is called, the following registers are set and must not be changed - # %rdi: Address of q - # %rsi: Address of hh - # %rdx: nb - # %rcx: Remaining rows nq - # %r8: ldq in bytes - - movq %rdi, %r10 # Copy address of q - movq %rsi, %r11 # Copy address of hh - - # set %rax to the address of hh at the end of the loops, - # i.e. if %rdx >= %rax we must jump out of the loop. 
- # please note: %rax = 16*%rdx + %rsi - movq %rdx, %rax - addq %rax, %rax - addq %rax, %rax - addq %rax, %rax - addq %rax, %rax - addq %rsi, %rax - -# x1 = q(1,1); y1 = 0 -# x2 = q(2,1); y2 = 0 -# ... - - movaps (%r10), %xmm0 - movaps 16(%r10), %xmm1 - xorps %xmm6, %xmm6 - xorps %xmm7, %xmm7 - .if \nrows>=4 - movaps 32(%r10), %xmm2 - movaps 48(%r10), %xmm3 - xorps %xmm8, %xmm8 - xorps %xmm9, %xmm9 - .if \nrows==6 - movaps 64(%r10), %xmm4 - movaps 80(%r10), %xmm5 - xorps %xmm10, %xmm10 - xorps %xmm11, %xmm11 - .endif - .endif - -# do i=2,nb -# h1 = conjg(hh(i)) -# x1 = x1 + q(1,i)*h1 -# x2 = x2 + q(2,i)*h1 -# ... -# enddo - - addq $16, %r11 # %r11 => hh(2) - .align 16 -1: - cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax - jge 2f - - addq %r8, %r10 # %r10 => q(.,i) - - movddup (%r11), %xmm14 # real(hh(i)) - movddup 8(%r11), %xmm15 # imag(hh(i)) - - .macro mac_loop1 qoff, X, Y - movaps \qoff(%r10), %xmm13 # q(.,i) - movaps %xmm13, %xmm12 - mulpd %xmm14, %xmm13 # q(.,i)*real(hh(i)) - addpd %xmm13, \X # x1 = x1 + q(.,i)*real(hh(i)) - mulpd %xmm15, %xmm12 # q(.,i)*imag(hh(i)) - addsubpd %xmm12, \Y # y1 = y1 -/+ q(.,i)*imag(hh(i)) - .endm - - mac_loop1 0, %xmm0, %xmm6 - mac_loop1 16, %xmm1, %xmm7 - .if \nrows>=4 - mac_loop1 32, %xmm2, %xmm8 - mac_loop1 48, %xmm3, %xmm9 - .if \nrows==6 - mac_loop1 64, %xmm4, %xmm10 - mac_loop1 80, %xmm5, %xmm11 - .endif - .endif - - .purgem mac_loop1 - - addq $16, %r11 # %r11 => hh(i+1) - jmp 1b -2: - - # Now the content of the yn has to be swapped and added to xn - .macro mac_post_loop_1 X, Y - shufpd $1, \Y, \Y - addpd \Y, \X - .endm - - mac_post_loop_1 %xmm0, %xmm6 - mac_post_loop_1 %xmm1, %xmm7 - .if \nrows>=4 - mac_post_loop_1 %xmm2, %xmm8 - mac_post_loop_1 %xmm3, %xmm9 - .if \nrows==6 - mac_post_loop_1 %xmm4, %xmm10 - mac_post_loop_1 %xmm5, %xmm11 - .endif - .endif - .purgem mac_post_loop_1 - -# tau1 = hh(1) -# -# h1 = -tau1 -# x1 = x1*h1; y1 = x1 with halfes exchanged -# x2 = x2*h1; y2 = x2 with halfes exchanged -# ... 
- - movq %rsi, %r11 # restore address of hh - - xorps %xmm14, %xmm14 - movddup (%r11), %xmm12 # real(hh(1)) - subpd %xmm12, %xmm14 #-real(hh(1)) - xorps %xmm15, %xmm15 - movddup 8(%r11), %xmm12 # imag(hh(1)) - subpd %xmm12, %xmm15 #-imag(hh(1)) - - .macro mac_xform X, Y - movaps \X, %xmm12 - shufpd $1, \X, %xmm12 - mulpd %xmm15, %xmm12 - mulpd %xmm14, \X - addsubpd %xmm12, \X - movaps \X, \Y # copy to y - shufpd $1, \X, \Y # exchange halfes - .endm - - mac_xform %xmm0, %xmm6 - mac_xform %xmm1, %xmm7 - .if \nrows>=4 - mac_xform %xmm2, %xmm8 - mac_xform %xmm3, %xmm9 - .if \nrows==6 - mac_xform %xmm4, %xmm10 - mac_xform %xmm5, %xmm11 - .endif - .endif - .purgem mac_xform - -# q(1,1) = q(1,1) + x1 -# q(2,1) = q(2,1) + x2 -# ... - - movq %rdi, %r10 # restore address of q - .macro mac_pre_loop2 qoff, X - movaps \qoff(%r10), %xmm13 # q(.,1) - addpd \X, %xmm13 - movaps %xmm13, \qoff(%r10) - .endm - - mac_pre_loop2 0, %xmm0 - mac_pre_loop2 16, %xmm1 - .if \nrows>=4 - mac_pre_loop2 32, %xmm2 - mac_pre_loop2 48, %xmm3 - .if \nrows==6 - mac_pre_loop2 64, %xmm4 - mac_pre_loop2 80, %xmm5 - .endif - .endif - .purgem mac_pre_loop2 - -# do i=2,nb -# h1 = hh(i) -# q(1,i) = q(1,i) + x1*h1 -# q(2,i) = q(2,i) + x2*h1 -# ... 
-# enddo - - addq $16, %r11 - .align 16 -1: - cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax - jge 2f - - addq %r8, %r10 # %r10 => q(.,i) - - movddup (%r11), %xmm14 # real(hh(i)) - movddup 8(%r11), %xmm15 # imag(hh(i)) - - .macro mac_loop2 qoff, X, Y - movaps \X, %xmm13 - mulpd %xmm14, %xmm13 - movaps \Y, %xmm12 - mulpd %xmm15, %xmm12 - addsubpd %xmm12, %xmm13 - addpd \qoff(%r10), %xmm13 - movaps %xmm13, \qoff(%r10) - .endm - - mac_loop2 0, %xmm0, %xmm6 - mac_loop2 16, %xmm1, %xmm7 - .if \nrows>=4 - mac_loop2 32, %xmm2, %xmm8 - mac_loop2 48, %xmm3, %xmm9 - .if \nrows==6 - mac_loop2 64, %xmm4, %xmm10 - mac_loop2 80, %xmm5, %xmm11 - .endif - .endif - .purgem mac_loop2 - - addq $16, %r11 - jmp 1b -2: - .endm - -#------------------------------------------------------------------------------- -#------------------------------------------------------------------------------- -# FORTRAN Interface: -# -# subroutine single_hh_trafo_complex(q, hh, nb, nq, ldq) -# -# integer, intent(in) :: nb, nq, ldq -# complex*16, intent(inout) :: q(ldq,*) -# complex*16, intent(in) :: hh(*) -# -# Parameter mapping to registers -# parameter 1: %rdi : q -# parameter 2: %rsi : hh -# parameter 3: %rdx : nb -# parameter 4: %rcx : nq -# parameter 5: %r8 : ldq -# -#------------------------------------------------------------------------------- - .align 16,0x90 -single_hh_trafo_complex_: - - # Get integer parameters into corresponding registers - - movslq (%rdx), %rdx # nb - movslq (%rcx), %rcx # nq - movslq (%r8), %r8 # ldq - - # Get ldq in bytes - addq %r8, %r8 - addq %r8, %r8 - addq %r8, %r8 - addq %r8, %r8 # 16*ldq, i.e. 
ldq in bytes - -cloop_s: - cmpq $4, %rcx # if %rcx <= 4 jump out of loop - jle cloop_e - hh_trafo_complex 6 # transform 6 rows - addq $96, %rdi # increment q start adress by 96 bytes (6 rows) - subq $6, %rcx # decrement nq - jmp cloop_s -cloop_e: - - cmpq $2, %rcx # if %rcx <= 2 jump to test_2 - jle test_2 - hh_trafo_complex 4 # transform 4 rows - jmp return2 - -test_2: - cmpq $0, %rcx # if %rcx <= 0 jump to return - jle return2 - hh_trafo_complex 2 # transform 2 rows - -return2: - ret - - .align 16,0x90 -#------------------------------------------------------------------------------- -#------------------------------------------------------------------------------- -#------------------------------------------------------------------------------- - -# Declare that we do not need an executable stack here - .section .note.GNU-stack,"",@progbits diff --git a/src/elpa2_kernels/elpa2_kernels_complex.F90 b/src/elpa2_kernels/elpa2_kernels_complex.F90 deleted file mode 100644 index 02efee2bc..000000000 --- a/src/elpa2_kernels/elpa2_kernels_complex.F90 +++ /dev/null @@ -1,888 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! 
GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! -------------------------------------------------------------------------------------------------- -! -! This file contains the compute intensive kernels for the Householder transformations. -! It should be compiled with the highest possible optimization level. -! -! On Intel use -O3 -xSSE4.2 (or the SSE level fitting to your CPU) -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -! 
-------------------------------------------------------------------------------------------------- - -#include "config-f90.h" - -module complex_generic_kernel - - private - public single_hh_trafo_complex_generic -contains - subroutine single_hh_trafo_complex_generic(q, hh, nb, nq, ldq) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, nq, ldq -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(*) -#else - complex(kind=ck), intent(inout) :: q(1:ldq,1:nb) - complex(kind=ck), intent(in) :: hh(1:nb) -#endif - - integer(kind=ik) :: i -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: single_hh_trafo_complex_generic") -#endif - - ! Safety only: - - if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!' - - ! Do the Householder transformations - - ! Always a multiple of 4 Q-rows is transformed, even if nq is smaller - - do i=1,nq-8,12 -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call hh_trafo_complex_kernel_12(q(i,1),hh, nb, ldq) -#else - call hh_trafo_complex_kernel_12(q(i:ldq,1:nb),hh(1:nb), nb, ldq) -#endif - enddo - - ! i > nq-8 now, i.e. at most 8 rows remain - - if(nq-i+1 > 4) then -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call hh_trafo_complex_kernel_8(q(i,1),hh, nb, ldq) -#else - call hh_trafo_complex_kernel_8(q(i:ldq,1:nb),hh(1:nb), nb, ldq) -#endif - else if(nq-i+1 > 0) then -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call hh_trafo_complex_kernel_4(q(i,1),hh, nb, ldq) -#else - call hh_trafo_complex_kernel_4(q(i:ldq,1:nb),hh(1:nb), nb, ldq) -#endif - endif -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: single_hh_trafo_complex_generic") -#endif - end subroutine single_hh_trafo_complex_generic - - ! 
-------------------------------------------------------------------------------------------------- - - subroutine double_hh_trafo_complex_generic(q, hh, nb, nq, ldq, ldh) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, nq, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(ldh,*) -#else - complex(kind=ck), intent(inout) :: q(1:ldq,1:nb+1) - complex(kind=ck), intent(in) :: hh(1:ldh,1:2) -#endif - complex(kind=ck) :: s - - integer(kind=ik) :: i - - ! Safety only: -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: double_hh_trafo_complex_generic") -#endif - - if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!' - - ! Calculate dot product of the two Householder vectors - - s = conjg(hh(2,2)*1) - do i=3,nb - s = s+(conjg(hh(i,2))*hh(i-1,1)) - enddo - - ! Do the Householder transformations - - ! Always a multiple of 4 Q-rows is transformed, even if nq is smaller - - do i=1,nq,4 -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call hh_trafo_complex_kernel_4_2hv(q(i,1),hh, nb, ldq, ldh, s) -#else - call hh_trafo_complex_kernel_4_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) -#endif - enddo - - !do i=1,nq-8,12 -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - ! call hh_trafo_complex_kernel_12_2hv(q(i,1),hh, nb, ldq, ldh, s) -#else - ! call hh_trafo_complex_kernel_12_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) -#endif - !enddo - - ! i > nq-8 now, i.e. at most 8 rows remain - - !if(nq-i+1 > 4) then -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - ! call hh_trafo_complex_kernel_8_2hv(q(i,1),hh, nb, ldq, ldh, s) -#else - ! call hh_trafo_complex_kernel_8_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) -#endif - !else if(nq-i+1 > 0) then -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - ! 
call hh_trafo_complex_kernel_4_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) -#else - -#endif - !endif -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: double_hh_trafo_complex_generic") -#endif - - end subroutine double_hh_trafo_complex_generic - - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(*) -#else - complex(kind=ck), intent(inout) :: q(:,:) - complex(kind=ck), intent(in) :: hh(1:nb) -#endif - complex(kind=ck) :: x1, x2, x3, x4, x5, x6, x7, x8, x9, xa, xb, xc - complex(kind=ck) :: h1, tau1 - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_complex_kernel_12") -#endif - - x1 = q(1,1) - x2 = q(2,1) - x3 = q(3,1) - x4 = q(4,1) - x5 = q(5,1) - x6 = q(6,1) - x7 = q(7,1) - x8 = q(8,1) - x9 = q(9,1) - xa = q(10,1) - xb = q(11,1) - xc = q(12,1) - - !DEC$ VECTOR ALIGNED - do i=2,nb - h1 = conjg(hh(i)) - x1 = x1 + q(1,i)*h1 - x2 = x2 + q(2,i)*h1 - x3 = x3 + q(3,i)*h1 - x4 = x4 + q(4,i)*h1 - x5 = x5 + q(5,i)*h1 - x6 = x6 + q(6,i)*h1 - x7 = x7 + q(7,i)*h1 - x8 = x8 + q(8,i)*h1 - x9 = x9 + q(9,i)*h1 - xa = xa + q(10,i)*h1 - xb = xb + q(11,i)*h1 - xc = xc + q(12,i)*h1 - enddo - - tau1 = hh(1) - - h1 = -tau1 - x1 = x1*h1 - x2 = x2*h1 - x3 = x3*h1 - x4 = x4*h1 - x5 = x5*h1 - x6 = x6*h1 - x7 = x7*h1 - x8 = x8*h1 - x9 = x9*h1 - xa = xa*h1 - xb = xb*h1 - xc = xc*h1 - - q(1,1) = q(1,1) + x1 - q(2,1) = q(2,1) + x2 - q(3,1) = q(3,1) + x3 - q(4,1) = q(4,1) + x4 - q(5,1) = q(5,1) + x5 - q(6,1) = q(6,1) + x6 - q(7,1) = q(7,1) + x7 - q(8,1) = q(8,1) + x8 - q(9,1) = q(9,1) + x9 - q(10,1) = q(10,1) + xa - q(11,1) = q(11,1) + xb - q(12,1) = q(12,1) + xc - - !DEC$ 
VECTOR ALIGNED - do i=2,nb - h1 = hh(i) - q(1,i) = q(1,i) + x1*h1 - q(2,i) = q(2,i) + x2*h1 - q(3,i) = q(3,i) + x3*h1 - q(4,i) = q(4,i) + x4*h1 - q(5,i) = q(5,i) + x5*h1 - q(6,i) = q(6,i) + x6*h1 - q(7,i) = q(7,i) + x7*h1 - q(8,i) = q(8,i) + x8*h1 - q(9,i) = q(9,i) + x9*h1 - q(10,i) = q(10,i) + xa*h1 - q(11,i) = q(11,i) + xb*h1 - q(12,i) = q(12,i) + xc*h1 - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_complex_kernel_12") -#endif - - end subroutine hh_trafo_complex_kernel_12 - - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(*) -#else - complex(kind=ck), intent(inout) :: q(:,:) - complex(kind=ck), intent(in) :: hh(1:nb) -#endif - complex(kind=ck) :: x1, x2, x3, x4, x5, x6, x7, x8 - complex(kind=ck) :: h1, tau1 - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_complex_kernel_8") -#endif - - x1 = q(1,1) - x2 = q(2,1) - x3 = q(3,1) - x4 = q(4,1) - x5 = q(5,1) - x6 = q(6,1) - x7 = q(7,1) - x8 = q(8,1) - - !DEC$ VECTOR ALIGNED - do i=2,nb - h1 = conjg(hh(i)) - x1 = x1 + q(1,i)*h1 - x2 = x2 + q(2,i)*h1 - x3 = x3 + q(3,i)*h1 - x4 = x4 + q(4,i)*h1 - x5 = x5 + q(5,i)*h1 - x6 = x6 + q(6,i)*h1 - x7 = x7 + q(7,i)*h1 - x8 = x8 + q(8,i)*h1 - enddo - - tau1 = hh(1) - - h1 = -tau1 - x1 = x1*h1 - x2 = x2*h1 - x3 = x3*h1 - x4 = x4*h1 - x5 = x5*h1 - x6 = x6*h1 - x7 = x7*h1 - x8 = x8*h1 - - q(1,1) = q(1,1) + x1 - q(2,1) = q(2,1) + x2 - q(3,1) = q(3,1) + x3 - q(4,1) = q(4,1) + x4 - q(5,1) = q(5,1) + x5 - q(6,1) = q(6,1) + x6 - q(7,1) = q(7,1) + x7 - q(8,1) = q(8,1) + x8 - - !DEC$ VECTOR ALIGNED - do i=2,nb - h1 = hh(i) - q(1,i) = q(1,i) + x1*h1 - 
q(2,i) = q(2,i) + x2*h1 - q(3,i) = q(3,i) + x3*h1 - q(4,i) = q(4,i) + x4*h1 - q(5,i) = q(5,i) + x5*h1 - q(6,i) = q(6,i) + x6*h1 - q(7,i) = q(7,i) + x7*h1 - q(8,i) = q(8,i) + x8*h1 - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_complex_kernel_8") -#endif - end subroutine hh_trafo_complex_kernel_8 - - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(*) -#else - complex(kind=ck), intent(inout) :: q(:,:) - complex(kind=ck), intent(in) :: hh(1:nb) -#endif - complex(kind=ck) :: x1, x2, x3, x4 - complex(kind=ck) :: h1, tau1 - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_complex_kernel_4") -#endif - x1 = q(1,1) - x2 = q(2,1) - x3 = q(3,1) - x4 = q(4,1) - - !DEC$ VECTOR ALIGNED - do i=2,nb - h1 = conjg(hh(i)) - x1 = x1 + q(1,i)*h1 - x2 = x2 + q(2,i)*h1 - x3 = x3 + q(3,i)*h1 - x4 = x4 + q(4,i)*h1 - enddo - - tau1 = hh(1) - - h1 = -tau1 - x1 = x1*h1 - x2 = x2*h1 - x3 = x3*h1 - x4 = x4*h1 - - q(1,1) = q(1,1) + x1 - q(2,1) = q(2,1) + x2 - q(3,1) = q(3,1) + x3 - q(4,1) = q(4,1) + x4 - - !DEC$ VECTOR ALIGNED - do i=2,nb - h1 = hh(i) - q(1,i) = q(1,i) + x1*h1 - q(2,i) = q(2,i) + x2*h1 - q(3,i) = q(3,i) + x3*h1 - q(4,i) = q(4,i) + x4*h1 - enddo -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_complex_kernel_4") -#endif - - end subroutine hh_trafo_complex_kernel_4 - - ! 
-------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_complex_kernel_4_2hv(q, hh, nb, ldq, ldh, s) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(ldh,*) -#else - complex(kind=ck), intent(inout) :: q(:,:) - complex(kind=ck), intent(in) :: hh(1:ldh,1:2) -#endif - complex(kind=ck), intent(in) :: s - - complex(kind=ck) :: x1, x2, x3, x4, y1, y2, y3, y4 - complex(kind=ck) :: h1, h2, tau1, tau2 - integer(kind=ik) :: i -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_complex_kernel_4_2hv") -#endif - x1 = q(1,2) - x2 = q(2,2) - x3 = q(3,2) - x4 = q(4,2) - - y1 = q(1,1) + q(1,2)*conjg(hh(2,2)) - y2 = q(2,1) + q(2,2)*conjg(hh(2,2)) - y3 = q(3,1) + q(3,2)*conjg(hh(2,2)) - y4 = q(4,1) + q(4,2)*conjg(hh(2,2)) - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = conjg(hh(i-1,1)) - h2 = conjg(hh(i,2)) - x1 = x1 + q(1,i)*h1 - y1 = y1 + q(1,i)*h2 - x2 = x2 + q(2,i)*h1 - y2 = y2 + q(2,i)*h2 - x3 = x3 + q(3,i)*h1 - y3 = y3 + q(3,i)*h2 - x4 = x4 + q(4,i)*h1 - y4 = y4 + q(4,i)*h2 - enddo - - x1 = x1 + q(1,nb+1)*conjg(hh(nb,1)) - x2 = x2 + q(2,nb+1)*conjg(hh(nb,1)) - x3 = x3 + q(3,nb+1)*conjg(hh(nb,1)) - x4 = x4 + q(4,nb+1)*conjg(hh(nb,1)) - - tau1 = hh(1,1) - tau2 = hh(1,2) - - h1 = -tau1 - x1 = x1*h1 - x2 = x2*h1 - x3 = x3*h1 - x4 = x4*h1 - h1 = -tau2 - h2 = -tau2*s - y1 = y1*h1 + x1*h2 - y2 = y2*h1 + x2*h2 - y3 = y3*h1 + x3*h2 - y4 = y4*h1 + x4*h2 - - q(1,1) = q(1,1) + y1 - q(2,1) = q(2,1) + y2 - q(3,1) = q(3,1) + y3 - q(4,1) = q(4,1) + y4 - - q(1,2) = q(1,2) + x1 + y1*hh(2,2) - q(2,2) = q(2,2) + x2 + y2*hh(2,2) - q(3,2) = q(3,2) + x3 + y3*hh(2,2) - q(4,2) = q(4,2) + x4 + y4*hh(2,2) - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - q(1,i) = q(1,i) + x1*h1 + y1*h2 - q(2,i) = 
q(2,i) + x2*h1 + y2*h2 - q(3,i) = q(3,i) + x3*h1 + y3*h2 - q(4,i) = q(4,i) + x4*h1 + y4*h2 - enddo - - q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) - q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) - q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1) - q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_complex_kernel_4_2hv") -#endif - - end subroutine hh_trafo_complex_kernel_4_2hv - - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_complex_kernel_8_2hv(q, hh, nb, ldq, ldh, s) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(ldh,*) -#else - complex(kind=ck), intent(inout) :: q(:,:) - complex(kind=ck), intent(in) :: hh(1:ldh,1:2) -#endif - complex(kind=ck), intent(in) :: s - - complex(kind=ck) :: x1, x2, x3, x4, x5, x6 ,x7, x8, y1, y2, y3, y4, y5, y6, y7, y8 - complex(kind=ck) :: h1, h2, tau1, tau2 - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_complex_kernel_8_2hv") -#endif - - x1 = q(1,2) - x2 = q(2,2) - x3 = q(3,2) - x4 = q(4,2) - x5 = q(5,2) - x6 = q(6,2) - x7 = q(7,2) - x8 = q(8,2) - - y1 = q(1,1) + q(1,2)*conjg(hh(2,2)) - y2 = q(2,1) + q(2,2)*conjg(hh(2,2)) - y3 = q(3,1) + q(3,2)*conjg(hh(2,2)) - y4 = q(4,1) + q(4,2)*conjg(hh(2,2)) - y5 = q(5,1) + q(5,2)*conjg(hh(2,2)) - y6 = q(6,1) + q(6,2)*conjg(hh(2,2)) - y7 = q(7,1) + q(7,2)*conjg(hh(2,2)) - y8 = q(8,1) + q(8,2)*conjg(hh(2,2)) - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = conjg(hh(i-1,1)) - h2 = conjg(hh(i,2)) - x1 = x1 + q(1,i)*h1 - y1 = y1 + q(1,i)*h2 - x2 = x2 + q(2,i)*h1 - y2 = y2 + q(2,i)*h2 - x3 = x3 + q(3,i)*h1 - y3 = y3 + q(3,i)*h2 - x4 = x4 + q(4,i)*h1 - y4 = y4 + q(4,i)*h2 - x5 = x5 + q(5,i)*h1 - y5 = y5 + q(5,i)*h2 - x6 = x6 + q(6,i)*h1 - y6 = y6 
+ q(6,i)*h2 - x7 = x7 + q(7,i)*h1 - y7 = y7 + q(7,i)*h2 - x8 = x8 + q(8,i)*h1 - y8 = y8 + q(8,i)*h2 - enddo - - x1 = x1 + q(1,nb+1)*conjg(hh(nb,1)) - x2 = x2 + q(2,nb+1)*conjg(hh(nb,1)) - x3 = x3 + q(3,nb+1)*conjg(hh(nb,1)) - x4 = x4 + q(4,nb+1)*conjg(hh(nb,1)) - x5 = x5 + q(5,nb+1)*conjg(hh(nb,1)) - x6 = x6 + q(6,nb+1)*conjg(hh(nb,1)) - x7 = x7 + q(7,nb+1)*conjg(hh(nb,1)) - x8 = x8 + q(8,nb+1)*conjg(hh(nb,1)) - - tau1 = hh(1,1) - tau2 = hh(1,2) - - h1 = -tau1 - x1 = x1*h1 - x2 = x2*h1 - x3 = x3*h1 - x4 = x4*h1 - x5 = x5*h1 - x6 = x6*h1 - x7 = x7*h1 - x8 = x8*h1 - - h1 = -tau2 - h2 = -tau2*s - y1 = y1*h1 + x1*h2 - y2 = y2*h1 + x2*h2 - y3 = y3*h1 + x3*h2 - y4 = y4*h1 + x4*h2 - y5 = y5*h1 + x5*h2 - y6 = y6*h1 + x6*h2 - y7 = y7*h1 + x7*h2 - y8 = y8*h1 + x8*h2 - - q(1,1) = q(1,1) + y1 - q(2,1) = q(2,1) + y2 - q(3,1) = q(3,1) + y3 - q(4,1) = q(4,1) + y4 - q(5,1) = q(5,1) + y5 - q(6,1) = q(6,1) + y6 - q(7,1) = q(7,1) + y7 - q(8,1) = q(8,1) + y8 - - q(1,2) = q(1,2) + x1 + y1*hh(2,2) - q(2,2) = q(2,2) + x2 + y2*hh(2,2) - q(3,2) = q(3,2) + x3 + y3*hh(2,2) - q(4,2) = q(4,2) + x4 + y4*hh(2,2) - q(5,2) = q(5,2) + x5 + y5*hh(2,2) - q(6,2) = q(6,2) + x6 + y6*hh(2,2) - q(7,2) = q(7,2) + x7 + y7*hh(2,2) - q(8,2) = q(8,2) + x8 + y8*hh(2,2) - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - q(1,i) = q(1,i) + x1*h1 + y1*h2 - q(2,i) = q(2,i) + x2*h1 + y2*h2 - q(3,i) = q(3,i) + x3*h1 + y3*h2 - q(4,i) = q(4,i) + x4*h1 + y4*h2 - q(5,i) = q(5,i) + x5*h1 + y5*h2 - q(6,i) = q(6,i) + x6*h1 + y6*h2 - q(7,i) = q(7,i) + x7*h1 + y7*h2 - q(8,i) = q(8,i) + x8*h1 + y8*h2 - enddo - - q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) - q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) - q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1) - q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1) - q(5,nb+1) = q(5,nb+1) + x5*hh(nb,1) - q(6,nb+1) = q(6,nb+1) + x6*hh(nb,1) - q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1) - q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_complex_kernel_8_2hv") 
-#endif - - end subroutine hh_trafo_complex_kernel_8_2hv - - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_complex_kernel_12_2hv(q, hh, nb, ldq, ldh, s) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(ldh,*) -#else - complex(kind=ck), intent(inout) :: q(:,:) - complex(kind=ck), intent(in) :: hh(1:ldh,1:2) -#endif - complex(kind=ck), intent(in) :: s - - complex(kind=ck) :: x1, x2, x3, x4, x5, x6 ,x7, x8, x9, x10, x11, x12, y1, y2, y3, y4, y5, y6, & - y7, y8, y9, y10, y11, y12 - complex(kind=ck) :: h1, h2, tau1, tau2 - integer(kind=ik) :: i -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_complex_kernel_12_2hv") -#endif - x1 = q(1,2) - x2 = q(2,2) - x3 = q(3,2) - x4 = q(4,2) - x5 = q(5,2) - x6 = q(6,2) - x7 = q(7,2) - x8 = q(8,2) - x9 = q(9,2) - x10 = q(10,2) - x11 = q(11,2) - x12 = q(12,2) - - y1 = q(1,1) + q(1,2)*conjg(hh(2,2)) - y2 = q(2,1) + q(2,2)*conjg(hh(2,2)) - y3 = q(3,1) + q(3,2)*conjg(hh(2,2)) - y4 = q(4,1) + q(4,2)*conjg(hh(2,2)) - y5 = q(5,1) + q(5,2)*conjg(hh(2,2)) - y6 = q(6,1) + q(6,2)*conjg(hh(2,2)) - y7 = q(7,1) + q(7,2)*conjg(hh(2,2)) - y8 = q(8,1) + q(8,2)*conjg(hh(2,2)) - y9 = q(9,1) + q(9,2)*conjg(hh(2,2)) - y10 = q(10,1) + q(10,2)*conjg(hh(2,2)) - y11 = q(11,1) + q(11,2)*conjg(hh(2,2)) - y12 = q(12,1) + q(12,2)*conjg(hh(2,2)) - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = conjg(hh(i-1,1)) - h2 = conjg(hh(i,2)) - x1 = x1 + q(1,i)*h1 - y1 = y1 + q(1,i)*h2 - x2 = x2 + q(2,i)*h1 - y2 = y2 + q(2,i)*h2 - x3 = x3 + q(3,i)*h1 - y3 = y3 + q(3,i)*h2 - x4 = x4 + q(4,i)*h1 - y4 = y4 + q(4,i)*h2 - x5 = x5 + q(5,i)*h1 - y5 = y5 + q(5,i)*h2 - x6 = x6 + q(6,i)*h1 - y6 = y6 + q(6,i)*h2 - x7 = x7 + q(7,i)*h1 - y7 = y7 + q(7,i)*h2 - x8 = x8 + q(8,i)*h1 - y8 = 
y8 + q(8,i)*h2 - x9 = x9 + q(9,i)*h1 - y9 = y9 + q(9,i)*h2 - x10 = x10 + q(10,i)*h1 - y10 = y10 + q(10,i)*h2 - x11 = x11 + q(11,i)*h1 - y11 = y11 + q(11,i)*h2 - x12 = x12 + q(12,i)*h1 - y12 = y12 + q(12,i)*h2 - enddo - - x1 = x1 + q(1,nb+1)*conjg(hh(nb,1)) - x2 = x2 + q(2,nb+1)*conjg(hh(nb,1)) - x3 = x3 + q(3,nb+1)*conjg(hh(nb,1)) - x4 = x4 + q(4,nb+1)*conjg(hh(nb,1)) - x5 = x5 + q(5,nb+1)*conjg(hh(nb,1)) - x6 = x6 + q(6,nb+1)*conjg(hh(nb,1)) - x7 = x7 + q(7,nb+1)*conjg(hh(nb,1)) - x8 = x8 + q(8,nb+1)*conjg(hh(nb,1)) - x9 = x9 + q(9,nb+1)*conjg(hh(nb,1)) - x10 = x10 + q(10,nb+1)*conjg(hh(nb,1)) - x11 = x11 + q(11,nb+1)*conjg(hh(nb,1)) - x12 = x12 + q(12,nb+1)*conjg(hh(nb,1)) - - tau1 = hh(1,1) - tau2 = hh(1,2) - - h1 = -tau1 - x1 = x1*h1 - x2 = x2*h1 - x3 = x3*h1 - x4 = x4*h1 - x5 = x5*h1 - x6 = x6*h1 - x7 = x7*h1 - x8 = x8*h1 - x9 = x9*h1 - x10 = x10*h1 - x11 = x11*h1 - x12 = x12*h1 - h1 = -tau2 - h2 = -tau2*s - y1 = y1*h1 + x1*h2 - y2 = y2*h1 + x2*h2 - y3 = y3*h1 + x3*h2 - y4 = y4*h1 + x4*h2 - y5 = y5*h1 + x5*h2 - y6 = y6*h1 + x6*h2 - y7 = y7*h1 + x7*h2 - y8 = y8*h1 + x8*h2 - y9 = y9*h1 + x9*h2 - y10 = y10*h1 + x10*h2 - y11 = y11*h1 + x11*h2 - y12 = y12*h1 + x12*h2 - - q(1,1) = q(1,1) + y1 - q(2,1) = q(2,1) + y2 - q(3,1) = q(3,1) + y3 - q(4,1) = q(4,1) + y4 - q(5,1) = q(5,1) + y5 - q(6,1) = q(6,1) + y6 - q(7,1) = q(7,1) + y7 - q(8,1) = q(8,1) + y8 - q(9,1) = q(9,1) + y9 - q(10,1) = q(10,1) + y10 - q(11,1) = q(11,1) + y11 - q(12,1) = q(12,1) + y12 - - q(1,2) = q(1,2) + x1 + y1*hh(2,2) - q(2,2) = q(2,2) + x2 + y2*hh(2,2) - q(3,2) = q(3,2) + x3 + y3*hh(2,2) - q(4,2) = q(4,2) + x4 + y4*hh(2,2) - q(5,2) = q(5,2) + x5 + y5*hh(2,2) - q(6,2) = q(6,2) + x6 + y6*hh(2,2) - q(7,2) = q(7,2) + x7 + y7*hh(2,2) - q(8,2) = q(8,2) + x8 + y8*hh(2,2) - q(9,2) = q(9,2) + x9 + y9*hh(2,2) - q(10,2) = q(10,2) + x10 + y10*hh(2,2) - q(11,2) = q(11,2) + x11 + y11*hh(2,2) - q(12,2) = q(12,2) + x12 + y12*hh(2,2) - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - q(1,i) = 
q(1,i) + x1*h1 + y1*h2 - q(2,i) = q(2,i) + x2*h1 + y2*h2 - q(3,i) = q(3,i) + x3*h1 + y3*h2 - q(4,i) = q(4,i) + x4*h1 + y4*h2 - q(5,i) = q(5,i) + x5*h1 + y5*h2 - q(6,i) = q(6,i) + x6*h1 + y6*h2 - q(7,i) = q(7,i) + x7*h1 + y7*h2 - q(8,i) = q(8,i) + x8*h1 + y8*h2 - q(9,i) = q(9,i) + x9*h1 + y9*h2 - q(10,i) = q(10,i) + x10*h1 + y10*h2 - q(11,i) = q(11,i) + x11*h1 + y11*h2 - q(12,i) = q(12,i) + x12*h1 + y12*h2 - enddo - - q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) - q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) - q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1) - q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1) - q(5,nb+1) = q(5,nb+1) + x5*hh(nb,1) - q(6,nb+1) = q(6,nb+1) + x6*hh(nb,1) - q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1) - q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1) - q(9,nb+1) = q(9,nb+1) + x9*hh(nb,1) - q(10,nb+1) = q(10,nb+1) + x10*hh(nb,1) - q(11,nb+1) = q(11,nb+1) + x11*hh(nb,1) - q(12,nb+1) = q(12,nb+1) + x12*hh(nb,1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_complex_kernel_12_2hv") -#endif - - end subroutine hh_trafo_complex_kernel_12_2hv -end module complex_generic_kernel -! -------------------------------------------------------------------------------------------------- diff --git a/src/elpa2_kernels/elpa2_kernels_complex_simple.F90 b/src/elpa2_kernels/elpa2_kernels_complex_simple.F90 deleted file mode 100644 index 3d0f62091..000000000 --- a/src/elpa2_kernels/elpa2_kernels_complex_simple.F90 +++ /dev/null @@ -1,177 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! 
- Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! -------------------------------------------------------------------------------------------------- -! -! This file contains the compute intensive kernels for the Householder transformations. -! -! This is the small and simple version (no hand unrolling of loops etc.) but for some -! compilers this performs better than a sophisticated version with transformed and unrolled loops. -! -! It should be compiled with the highest possible optimization level. -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -! 
-------------------------------------------------------------------------------------------------- - -#include "config-f90.h" - -module complex_generic_simple_kernel - - private - public single_hh_trafo_complex_generic_simple -contains - subroutine single_hh_trafo_complex_generic_simple(q, hh, nb, nq, ldq) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, nq, ldq -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(*) -#else - complex(kind=ck), intent(inout) :: q(1:ldq,1:nb) - complex(kind=ck), intent(in) :: hh(1:nb) -#endif - integer(kind=ik) :: i - complex(kind=ck) :: h1, tau1, x(nq) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel complex generic simple: single_hh_trafo_complex_generic_simple") -#endif - ! Just one Householder transformation - - x(1:nq) = q(1:nq,1) - - do i=2,nb - x(1:nq) = x(1:nq) + q(1:nq,i)*conjg(hh(i)) - enddo - - tau1 = hh(1) - x(1:nq) = x(1:nq)*(-tau1) - - q(1:nq,1) = q(1:nq,1) + x(1:nq) - - do i=2,nb - q(1:nq,i) = q(1:nq,i) + x(1:nq)*hh(i) - enddo -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel complex generic simple: single_hh_trafo_complex_generic_simple") -#endif - end subroutine single_hh_trafo_complex_generic_simple - - ! 
-------------------------------------------------------------------------------------------------- - subroutine double_hh_trafo_complex_generic_simple(q, hh, nb, nq, ldq, ldh) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, nq, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(ldh,*) -#else - complex(kind=ck), intent(inout) :: q(1:ldq,1:nb+1) - complex(kind=ck), intent(in) :: hh(1:ldh,1:2) -#endif - complex(kind=ck) :: s, h1, h2, tau1, tau2, x(nq), y(nq) - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel complex generic simple: double_hh_trafo_complex_generic_simple") -#endif - ! Calculate dot product of the two Householder vectors - - s = conjg(hh(2,2))*1 - do i=3,nb - s = s+(conjg(hh(i,2))*hh(i-1,1)) - enddo - - ! Do the Householder transformations - - x(1:nq) = q(1:nq,2) - - y(1:nq) = q(1:nq,1) + q(1:nq,2)*conjg(hh(2,2)) - - do i=3,nb - h1 = conjg(hh(i-1,1)) - h2 = conjg(hh(i,2)) - x(1:nq) = x(1:nq) + q(1:nq,i)*h1 - y(1:nq) = y(1:nq) + q(1:nq,i)*h2 - enddo - - x(1:nq) = x(1:nq) + q(1:nq,nb+1)*conjg(hh(nb,1)) - - tau1 = hh(1,1) - tau2 = hh(1,2) - - h1 = -tau1 - x(1:nq) = x(1:nq)*h1 - h1 = -tau2 - h2 = -tau2*s - y(1:nq) = y(1:nq)*h1 + x(1:nq)*h2 - - q(1:nq,1) = q(1:nq,1) + y(1:nq) - q(1:nq,2) = q(1:nq,2) + x(1:nq) + y(1:nq)*hh(2,2) - - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - q(1:nq,i) = q(1:nq,i) + x(1:nq)*h1 + y(1:nq)*h2 - enddo - - q(1:nq,nb+1) = q(1:nq,nb+1) + x(1:nq)*hh(nb,1) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel complex generic simple: double_hh_trafo_complex_generic_simple") -#endif - end subroutine double_hh_trafo_complex_generic_simple -end module complex_generic_simple_kernel -! 
-------------------------------------------------------------------------------------------------- diff --git a/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp b/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp deleted file mode 100644 index 2337c6585..000000000 --- a/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp +++ /dev/null @@ -1,1057 +0,0 @@ -// This file is part of ELPA. -// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// This particular source code file contains additions, changes and -// enhancements authored by Intel Corporation which is not part of -// the ELPA consortium. -// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. 
If not, see <http://www.gnu.org/licenses/> -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. -// -// -// -------------------------------------------------------------------------------------------------- -// -// This file contains the compute intensive kernels for the Householder transformations. -// It should be compiled with the highest possible optimization level. -// -// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 -// On Intel Sandy Bridge use -O3 -mavx -// -// Copyright of the original code rests with the authors inside the ELPA -// consortium. The copyright of any additional modifications shall rest -// with their original authors, but shall adhere to the licensing terms -// distributed along with the original code in the file "COPYING". 
-// -// Author: Alexander Heinecke (alexander.heinecke@mytum.de) -// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) -// -------------------------------------------------------------------------------------------------- - -#include <complex> -#include <x86intrin.h> - -#define __forceinline __attribute__((always_inline)) - -#ifdef __USE_AVX128__ -#undef __AVX__ -#endif - -#ifdef __FMA4__ -#define __ELPA_USE_FMA__ -#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c) -#define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c) -#endif - -#ifdef __AVX2__ -#define __ELPA_USE_FMA__ -#define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c) -#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c) -#endif - -extern "C" { - -//Forward declaration -#ifdef __AVX__ -static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); - -#else -static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); -#endif - -#if 0 -static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) -{ - std::complex<double> x0; - std::complex<double> x1; - std::complex<double> x2; - std::complex<double> x3; - std::complex<double> h0; - std::complex<double> tau0; - int i=0; - - x0 = q[0]; - x1 = q[1]; - x2 = q[2]; - x3 = q[3]; - - for (i = 
1; i < nb; i++) - { - h0 = conj(hh[i]); - x0 += (q[(i*ldq)+0] * h0); - x1 += (q[(i*ldq)+1] * h0); - x2 += (q[(i*ldq)+2] * h0); - x3 += (q[(i*ldq)+3] * h0); - } - - tau0 = hh[0]; - - h0 = (-1.0)*tau0; - - x0 *= h0; - x1 *= h0; - x2 *= h0; - x3 *= h0; - - q[0] += x0; - q[1] += x1; - q[2] += x2; - q[3] += x3; - - for (i = 1; i < nb; i++) - { - h0 = hh[i]; - q[(i*ldq)+0] += (x0*h0); - q[(i*ldq)+1] += (x1*h0); - q[(i*ldq)+2] += (x2*h0); - q[(i*ldq)+3] += (x3*h0); - } -} -#endif // if 0 - -void single_hh_trafo_complex_sse_avx_1hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - //int ldh = *pldh; - -#ifdef __AVX__ - for (i = 0; i < nq-8; i+=12) - { - hh_trafo_complex_kernel_12_AVX_1hv(&q[i], hh, nb, ldq); - } - if (nq-i > 4) - { - hh_trafo_complex_kernel_8_AVX_1hv(&q[i], hh, nb, ldq); - } - else if (nq-i > 0) - { - hh_trafo_complex_kernel_4_AVX_1hv(&q[i], hh, nb, ldq); - } -#else - for (i = 0; i < nq-4; i+=6) - { - hh_trafo_complex_kernel_6_SSE_1hv(&q[i], hh, nb, ldq); - } - if (nq-i > 2) - { - hh_trafo_complex_kernel_4_SSE_1hv(&q[i], hh, nb, ldq); - } - else if (nq-i > 0) - { - hh_trafo_complex_kernel_2_SSE_1hv(&q[i], hh, nb, ldq); - } -#endif -} - -#ifdef __AVX__ - static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - - __m256d x1, x2, x3, x4, x5, x6; - __m256d q1, q2, q3, q4, q5, q6; - __m256d h1_real, h1_imag; - __m256d tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - int i=0; - - __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); - - x1 = _mm256_load_pd(&q_dbl[0]); - x2 = _mm256_load_pd(&q_dbl[4]); - x3 = _mm256_load_pd(&q_dbl[8]); - x4 = _mm256_load_pd(&q_dbl[12]); - x5 = _mm256_load_pd(&q_dbl[16]); - x6 = _mm256_load_pd(&q_dbl[20]); - - for (i = 1; i < nb; i++) - { 
- h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); - q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); - q5 = _mm256_load_pd(&q_dbl[(2*i*ldq)+16]); - q6 = _mm256_load_pd(&q_dbl[(2*i*ldq)+20]); - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h1_imag, q4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - tmp5 = _mm256_mul_pd(h1_imag, q5); -#ifdef __ELPA_USE_FMA__ - x5 = _mm256_add_pd(x5, _mm256_FMSUBADD_pd(h1_real, q5, _mm256_shuffle_pd(tmp5, tmp5, 0x5))); -#else - x5 = _mm256_add_pd(x5, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q5), _mm256_shuffle_pd(tmp5, tmp5, 0x5))); -#endif - tmp6 = _mm256_mul_pd(h1_imag, q6); -#ifdef __ELPA_USE_FMA__ - x6 = _mm256_add_pd(x6, _mm256_FMSUBADD_pd(h1_real, q6, 
_mm256_shuffle_pd(tmp6, tmp6, 0x5))); -#else - x6 = _mm256_add_pd(x6, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q6), _mm256_shuffle_pd(tmp6, tmp6, 0x5))); -#endif - } - - h1_real = _mm256_broadcast_sd(&hh_dbl[0]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#else - x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#endif - tmp4 = _mm256_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)); -#else - x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)); -#endif - tmp5 = _mm256_mul_pd(h1_imag, x5); -#ifdef __ELPA_USE_FMA__ - x5 = _mm256_FMADDSUB_pd(h1_real, x5, _mm256_shuffle_pd(tmp5, tmp5, 0x5)); -#else - x5 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x5), _mm256_shuffle_pd(tmp5, tmp5, 0x5)); -#endif - tmp6 = _mm256_mul_pd(h1_imag, x6); -#ifdef __ELPA_USE_FMA__ - x6 = _mm256_FMADDSUB_pd(h1_real, x6, _mm256_shuffle_pd(tmp6, tmp6, 0x5)); -#else - x6 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x6), _mm256_shuffle_pd(tmp6, tmp6, 0x5)); -#endif - - q1 = _mm256_load_pd(&q_dbl[0]); - q2 = _mm256_load_pd(&q_dbl[4]); - q3 = _mm256_load_pd(&q_dbl[8]); - q4 = _mm256_load_pd(&q_dbl[12]); - q5 = _mm256_load_pd(&q_dbl[16]); - q6 = 
_mm256_load_pd(&q_dbl[20]); - - q1 = _mm256_add_pd(q1, x1); - q2 = _mm256_add_pd(q2, x2); - q3 = _mm256_add_pd(q3, x3); - q4 = _mm256_add_pd(q4, x4); - q5 = _mm256_add_pd(q5, x5); - q6 = _mm256_add_pd(q6, x6); - - _mm256_store_pd(&q_dbl[0], q1); - _mm256_store_pd(&q_dbl[4], q2); - _mm256_store_pd(&q_dbl[8], q3); - _mm256_store_pd(&q_dbl[12], q4); - _mm256_store_pd(&q_dbl[16], q5); - _mm256_store_pd(&q_dbl[20], q6); - - for (i = 1; i < nb; i++) - { - h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); - q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); - q5 = _mm256_load_pd(&q_dbl[(2*i*ldq)+16]); - q6 = _mm256_load_pd(&q_dbl[(2*i*ldq)+20]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - tmp5 = _mm256_mul_pd(h1_imag, x5); -#ifdef 
__ELPA_USE_FMA__ - q5 = _mm256_add_pd(q5, _mm256_FMADDSUB_pd(h1_real, x5, _mm256_shuffle_pd(tmp5, tmp5, 0x5))); -#else - q5 = _mm256_add_pd(q5, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x5), _mm256_shuffle_pd(tmp5, tmp5, 0x5))); -#endif - tmp6 = _mm256_mul_pd(h1_imag, x6); -#ifdef __ELPA_USE_FMA__ - q6 = _mm256_add_pd(q6, _mm256_FMADDSUB_pd(h1_real, x6, _mm256_shuffle_pd(tmp6, tmp6, 0x5))); -#else - q6 = _mm256_add_pd(q6, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x6), _mm256_shuffle_pd(tmp6, tmp6, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); - _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3); - _mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4); - _mm256_store_pd(&q_dbl[(2*i*ldq)+16], q5); - _mm256_store_pd(&q_dbl[(2*i*ldq)+20], q6); - } -} - -static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - - __m256d x1, x2, x3, x4; - __m256d q1, q2, q3, q4; - __m256d h1_real, h1_imag; - __m256d tmp1, tmp2, tmp3, tmp4; - int i=0; - - __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); - - x1 = _mm256_load_pd(&q_dbl[0]); - x2 = _mm256_load_pd(&q_dbl[4]); - x3 = _mm256_load_pd(&q_dbl[8]); - x4 = _mm256_load_pd(&q_dbl[12]); - - for (i = 1; i < nb; i++) - { - h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); - q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( 
_mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h1_imag, q4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - } - - h1_real = _mm256_broadcast_sd(&hh_dbl[0]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#else - x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#endif - tmp4 = _mm256_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)); -#else - x4 = _mm256_addsub_pd( 
_mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)); -#endif - - q1 = _mm256_load_pd(&q_dbl[0]); - q2 = _mm256_load_pd(&q_dbl[4]); - q3 = _mm256_load_pd(&q_dbl[8]); - q4 = _mm256_load_pd(&q_dbl[12]); - - q1 = _mm256_add_pd(q1, x1); - q2 = _mm256_add_pd(q2, x2); - q3 = _mm256_add_pd(q3, x3); - q4 = _mm256_add_pd(q4, x4); - - _mm256_store_pd(&q_dbl[0], q1); - _mm256_store_pd(&q_dbl[4], q2); - _mm256_store_pd(&q_dbl[8], q3); - _mm256_store_pd(&q_dbl[12], q4); - - for (i = 1; i < nb; i++) - { - h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); - q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); 
- _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3); - _mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4); - } -} - -static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - - __m256d x1, x2; - __m256d q1, q2; - __m256d h1_real, h1_imag; - __m256d tmp1, tmp2; - int i=0; - - __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); - - x1 = _mm256_load_pd(&q_dbl[0]); - x2 = _mm256_load_pd(&q_dbl[4]); - - for (i = 1; i < nb; i++) - { - h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - } - - h1_real = _mm256_broadcast_sd(&hh_dbl[0]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_FMADDSUB_pd(h1_real, x2, 
_mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - - q1 = _mm256_load_pd(&q_dbl[0]); - q2 = _mm256_load_pd(&q_dbl[4]); - - q1 = _mm256_add_pd(q1, x1); - q2 = _mm256_add_pd(q2, x2); - - _mm256_store_pd(&q_dbl[0], q1); - _mm256_store_pd(&q_dbl[4], q2); - - for (i = 1; i < nb; i++) - { - h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); - } -} - -#else -static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - - __m128d x1, x2, x3, x4, x5, x6; - __m128d q1, q2, q3, q4, q5, q6; - __m128d h1_real, h1_imag; - __m128d tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - int i=0; - - __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); - - x1 = _mm_load_pd(&q_dbl[0]); - x2 = _mm_load_pd(&q_dbl[2]); - x3 = _mm_load_pd(&q_dbl[4]); - x4 = _mm_load_pd(&q_dbl[6]); - x5 = _mm_load_pd(&q_dbl[8]); - x6 = _mm_load_pd(&q_dbl[10]); - - for (i = 1; i < nb; i++) - { - h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); -#ifndef 
__ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); - q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); - q5 = _mm_load_pd(&q_dbl[(2*i*ldq)+8]); - q6 = _mm_load_pd(&q_dbl[(2*i*ldq)+10]); - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h1_imag, q4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - tmp5 = _mm_mul_pd(h1_imag, q5); -#ifdef __ELPA_USE_FMA__ - x5 = _mm_add_pd(x5, _mm_msubadd_pd(h1_real, q5, _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); -#else - x5 = _mm_add_pd(x5, _mm_addsub_pd( _mm_mul_pd(h1_real, q5), _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); -#endif - tmp6 = _mm_mul_pd(h1_imag, q6); -#ifdef __ELPA_USE_FMA__ - x6 = _mm_add_pd(x6, _mm_msubadd_pd(h1_real, q6, _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); -#else - x6 = _mm_add_pd(x6, _mm_addsub_pd( _mm_mul_pd(h1_real, q6), _mm_shuffle_pd(tmp6, 
tmp6, _MM_SHUFFLE2(0,1)))); -#endif - } - - h1_real = _mm_loaddup_pd(&hh_dbl[0]); - h1_imag = _mm_loaddup_pd(&hh_dbl[1]); - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#else - x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#endif - tmp4 = _mm_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); -#else - x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); -#endif - tmp5 = _mm_mul_pd(h1_imag, x5); -#ifdef __ELPA_USE_FMA__ - x5 = _mm_maddsub_pd(h1_real, x5, _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1))); -#else - x5 = _mm_addsub_pd( _mm_mul_pd(h1_real, x5), _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1))); -#endif - tmp6 = _mm_mul_pd(h1_imag, x6); -#ifdef __ELPA_USE_FMA__ - x6 = _mm_maddsub_pd(h1_real, x6, _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1))); -#else - x6 = _mm_addsub_pd( _mm_mul_pd(h1_real, x6), _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1))); -#endif - - q1 = _mm_load_pd(&q_dbl[0]); - q2 = _mm_load_pd(&q_dbl[2]); - q3 = _mm_load_pd(&q_dbl[4]); - q4 = _mm_load_pd(&q_dbl[6]); - q5 = _mm_load_pd(&q_dbl[8]); - q6 = _mm_load_pd(&q_dbl[10]); - - q1 = _mm_add_pd(q1, x1); - q2 = _mm_add_pd(q2, x2); - q3 = _mm_add_pd(q3, x3); - q4 = 
_mm_add_pd(q4, x4); - q5 = _mm_add_pd(q5, x5); - q6 = _mm_add_pd(q6, x6); - - _mm_store_pd(&q_dbl[0], q1); - _mm_store_pd(&q_dbl[2], q2); - _mm_store_pd(&q_dbl[4], q3); - _mm_store_pd(&q_dbl[6], q4); - _mm_store_pd(&q_dbl[8], q5); - _mm_store_pd(&q_dbl[10], q6); - - for (i = 1; i < nb; i++) - { - h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); - q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); - q5 = _mm_load_pd(&q_dbl[(2*i*ldq)+8]); - q6 = _mm_load_pd(&q_dbl[(2*i*ldq)+10]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - tmp5 = _mm_mul_pd(h1_imag, x5); -#ifdef __ELPA_USE_FMA__ - q5 = _mm_add_pd(q5, _mm_maddsub_pd(h1_real, x5, _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); -#else - q5 = _mm_add_pd(q5, _mm_addsub_pd( _mm_mul_pd(h1_real, x5), 
_mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); -#endif - tmp6 = _mm_mul_pd(h1_imag, x6); -#ifdef __ELPA_USE_FMA__ - q6 = _mm_add_pd(q6, _mm_maddsub_pd(h1_real, x6, _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); -#else - q6 = _mm_add_pd(q6, _mm_addsub_pd( _mm_mul_pd(h1_real, x6), _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); - _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); - _mm_store_pd(&q_dbl[(2*i*ldq)+6], q4); - _mm_store_pd(&q_dbl[(2*i*ldq)+8], q5); - _mm_store_pd(&q_dbl[(2*i*ldq)+10], q6); - } -} - -static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - - __m128d x1, x2, x3, x4; - __m128d q1, q2, q3, q4; - __m128d h1_real, h1_imag; - __m128d tmp1, tmp2, tmp3, tmp4; - int i=0; - - __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); - - x1 = _mm_load_pd(&q_dbl[0]); - x2 = _mm_load_pd(&q_dbl[2]); - x3 = _mm_load_pd(&q_dbl[4]); - x4 = _mm_load_pd(&q_dbl[6]); - - for (i = 1; i < nb; i++) - { - h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); - q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( 
_mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h1_imag, q4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - } - - h1_real = _mm_loaddup_pd(&hh_dbl[0]); - h1_imag = _mm_loaddup_pd(&hh_dbl[1]); - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#else - x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#endif - tmp4 = _mm_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); -#else - x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); -#endif - - q1 = _mm_load_pd(&q_dbl[0]); - q2 = _mm_load_pd(&q_dbl[2]); - q3 = _mm_load_pd(&q_dbl[4]); - q4 = _mm_load_pd(&q_dbl[6]); - - q1 = _mm_add_pd(q1, x1); - q2 = 
_mm_add_pd(q2, x2); - q3 = _mm_add_pd(q3, x3); - q4 = _mm_add_pd(q4, x4); - - _mm_store_pd(&q_dbl[0], q1); - _mm_store_pd(&q_dbl[2], q2); - _mm_store_pd(&q_dbl[4], q3); - _mm_store_pd(&q_dbl[6], q4); - - for (i = 1; i < nb; i++) - { - h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); - q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); - _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); - _mm_store_pd(&q_dbl[(2*i*ldq)+6], q4); - } -} - -static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - - 
__m128d x1, x2; - __m128d q1, q2; - __m128d h1_real, h1_imag; - __m128d tmp1, tmp2; - int i=0; - - __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); - - x1 = _mm_load_pd(&q_dbl[0]); - x2 = _mm_load_pd(&q_dbl[2]); - - for (i = 1; i < nb; i++) - { - h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - } - - h1_real = _mm_loaddup_pd(&hh_dbl[0]); - h1_imag = _mm_loaddup_pd(&hh_dbl[1]); - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#endif - - q1 = _mm_load_pd(&q_dbl[0]); - q2 = _mm_load_pd(&q_dbl[2]); - - q1 = _mm_add_pd(q1, x1); - q2 = _mm_add_pd(q2, x2); - - _mm_store_pd(&q_dbl[0], q1); - _mm_store_pd(&q_dbl[2], q2); - - for (i = 1; i < nb; i++) - { - 
h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); - } -} -#endif -} // extern C diff --git a/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp b/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp deleted file mode 100644 index 3763e5d99..000000000 --- a/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp +++ /dev/null @@ -1,2763 +0,0 @@ -// This file is part of ELPA. -// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// This particular source code file contains additions, changes and -// enhancements authored by Intel Corporation which is not part of -// the ELPA consortium. 
-// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. If not, see <http://www.gnu.org/licenses/> -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. -// -// -// -------------------------------------------------------------------------------------------------- -// -// This file contains the compute intensive kernels for the Householder transformations. -// It should be compiled with the highest possible optimization level. -// -// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 -// On Intel Sandy Bridge use -O3 -mavx -// -// Copyright of the original code rests with the authors inside the ELPA -// consortium. The copyright of any additional modifications shall rest -// with their original authors, but shall adhere to the licensing terms -// distributed along with the original code in the file "COPYING". 
-// -// Author: Alexander Heinecke (alexander.heinecke@mytum.de) -// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) -// -------------------------------------------------------------------------------------------------- - -#include <complex> -#include <x86intrin.h> - -#define __forceinline __attribute__((always_inline)) - -#ifdef __USE_AVX128__ -#undef __AVX__ -#endif - -#ifdef __FMA4__ -#define __ELPA_USE_FMA__ -#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c) -#define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c) -#endif - -#ifdef __AVX2__ -#define __ELPA_USE_FMA__ -#define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c) -#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c) -#endif - -extern "C" { - -//Forward declaration -#ifdef __AVX__ -static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); -static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); -static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); -static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); -#else -static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); -static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); -static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); -static __forceinline void 
hh_trafo_complex_kernel_1_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); -#endif - -#if 0 -static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) -{ - std::complex<double> x1; - std::complex<double> x2; - std::complex<double> x3; - std::complex<double> x4; - std::complex<double> y1; - std::complex<double> y2; - std::complex<double> y3; - std::complex<double> y4; - std::complex<double> h1; - std::complex<double> h2; - std::complex<double> tau1; - std::complex<double> tau2; - int i=0; - - x1 = q[ldq+0]; - x2 = q[ldq+1]; - x3 = q[ldq+2]; - x4 = q[ldq+3]; - - h2 = conj(hh[ldh+1]); - - y1 = q[0] + (x1*h2); - y2 = q[1] + (x2*h2); - y3 = q[2] + (x3*h2); - y4 = q[3] + (x4*h2); - - for (i = 2; i < nb; i++) - { - h1 = conj(hh[i-1]); - h2 = conj(hh[ldh+i]); - - x1 += (q[(i*ldq)+0] * h1); - y1 += (q[(i*ldq)+0] * h2); - x2 += (q[(i*ldq)+1] * h1); - y2 += (q[(i*ldq)+1] * h2); - x3 += (q[(i*ldq)+2] * h1); - y3 += (q[(i*ldq)+2] * h2); - x4 += (q[(i*ldq)+3] * h1); - y4 += (q[(i*ldq)+3] * h2); - } - h1 = conj(hh[nb-1]); - - x1 += (q[(nb*ldq)+0] * h1); - x2 += (q[(nb*ldq)+1] * h1); - x3 += (q[(nb*ldq)+2] * h1); - x4 += (q[(nb*ldq)+3] * h1); - - tau1 = hh[0]; - tau2 = hh[ldh]; - - h1 = (-1.0)*tau1; - - x1 *= h1; - x2 *= h1; - x3 *= h1; - x4 *= h1; - - h1 = (-1.0)*tau2; - h2 = (-1.0)*tau2; - h2 *= s; - y1 = y1*h1 +x1*h2; - y2 = y2*h1 +x2*h2; - y3 = y3*h1 +x3*h2; - y4 = y4*h1 +x4*h2; - - q[0] += y1; - q[1] += y2; - q[2] += y3; - q[3] += y4; - - h2 = hh[ldh+1]; - q[ldq+0] += (x1 + (y1*h2)); - q[ldq+1] += (x2 + (y2*h2)); - q[ldq+2] += (x3 + (y3*h2)); - q[ldq+3] += (x4 + (y4*h2)); - - for (i = 2; i < nb; i++) - { - h1 = hh[i-1]; - h2 = hh[ldh+i]; - - q[(i*ldq)+0] += ((x1*h1) + (y1*h2)); - q[(i*ldq)+1] += ((x2*h1) + (y2*h2)); - q[(i*ldq)+2] += ((x3*h1) + (y3*h2)); - q[(i*ldq)+3] += ((x4*h1) + (y4*h2)); - } - - h1 = hh[nb-1]; - 
q[(nb*ldq)+0] += (x1*h1); - q[(nb*ldq)+1] += (x2*h1); - q[(nb*ldq)+2] += (x3*h1); - q[(nb*ldq)+3] += (x4*h1); -} -#endif - -void double_hh_trafo_complex_sse_avx_2hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - std::complex<double> s = conj(hh[(ldh)+1])*1.0; - for (i = 2; i < nb; i++) - { - s += hh[i-1] * conj(hh[(i+ldh)]); - } - -#ifdef __AVX__ -#if 1 - for (i = 0; i < nq-4; i+=8) - { - hh_trafo_complex_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } - if (nq-i > 0) - { - hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } -#else - for (i = 0; i < nq-4; i+=6) - { - hh_trafo_complex_kernel_6_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } - if (nq-i > 2) - { - hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } - else if (nq-i > 0) - { - hh_trafo_complex_kernel_2_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } -#endif -#else -#if 1 - for (i = 0; i < nq; i+=4) - { - hh_trafo_complex_kernel_4_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); - } -#else - for (i = 0; i < nq-2; i+=3) - { - hh_trafo_complex_kernel_3_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); - } - if (nq-i > 1) - { - hh_trafo_complex_kernel_2_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); - } - else if (nq-i > 0) - { - hh_trafo_complex_kernel_1_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); - } -#endif -#endif -} - -#ifdef __AVX__ -static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); - - __m256d x1, x2, x3, x4; - __m256d y1, y2, y3, y4; - __m256d q1, q2, q3, q4; - __m256d h1_real, h1_imag, h2_real, h2_imag; - __m256d tmp1, tmp2, tmp3, tmp4; - int i=0; - - __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); - - x1 = 
_mm256_load_pd(&q_dbl[(2*ldq)+0]); - x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]); - x3 = _mm256_load_pd(&q_dbl[(2*ldq)+8]); - x4 = _mm256_load_pd(&q_dbl[(2*ldq)+12]); - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm256_xor_pd(h2_imag, sign); -#endif - - y1 = _mm256_load_pd(&q_dbl[0]); - y2 = _mm256_load_pd(&q_dbl[4]); - y3 = _mm256_load_pd(&q_dbl[8]); - y4 = _mm256_load_pd(&q_dbl[12]); - - tmp1 = _mm256_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, x3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h2_imag, x4); -#ifdef __ELPA_USE_FMA__ - y4 = _mm256_add_pd(y4, _mm256_FMSUBADD_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); - q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); - - h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag 
= _mm256_xor_pd(h1_imag, sign); -#endif - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h1_imag, q4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm256_xor_pd(h2_imag, sign); -#endif - - tmp1 = _mm256_mul_pd(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, q2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, q3); -#ifdef __ELPA_USE_FMA__ - y3 
= _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h2_imag, q4); -#ifdef __ELPA_USE_FMA__ - y4 = _mm256_add_pd(y4, _mm256_FMSUBADD_pd(h2_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - } - - h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]); - q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]); - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h1_imag, q4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 
0x5))); -#endif - - h1_real = _mm256_broadcast_sd(&hh_dbl[0]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#else - x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#endif - tmp4 = _mm256_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)); -#else - x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)); -#endif - - h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); - h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); - - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - h2_real = _mm256_xor_pd(h2_real, sign); - h2_imag = _mm256_xor_pd(h2_imag, sign); - - __m128d tmp_s_128 = _mm_loadu_pd(s_dbl); - tmp2 = _mm256_broadcast_pd(&tmp_s_128); - tmp1 = _mm256_mul_pd(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _mm256_FMADDSUB_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2)); - h2_real = _mm256_broadcast_sd(&s_dbl[0]); - h2_imag = 
_mm256_broadcast_sd(&s_dbl[1]); - - tmp1 = _mm256_mul_pd(h1_imag, y1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMADDSUB_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, y2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_FMADDSUB_pd(h1_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - y2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - tmp3 = _mm256_mul_pd(h1_imag, y3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm256_FMADDSUB_pd(h1_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#else - y3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#endif - tmp4 = _mm256_mul_pd(h1_imag, y4); -#ifdef __ELPA_USE_FMA__ - y4 = _mm256_FMADDSUB_pd(h1_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)); -#else - y4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)); -#endif - - tmp1 = _mm256_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMADDSUB_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMADDSUB_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, x3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm256_add_pd(y3, _mm256_FMADDSUB_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h2_imag, x4); -#ifdef __ELPA_USE_FMA__ - y4 = _mm256_add_pd(y4, _mm256_FMADDSUB_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 
0x5))); -#else - y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - q1 = _mm256_load_pd(&q_dbl[0]); - q2 = _mm256_load_pd(&q_dbl[4]); - q3 = _mm256_load_pd(&q_dbl[8]); - q4 = _mm256_load_pd(&q_dbl[12]); - - q1 = _mm256_add_pd(q1, y1); - q2 = _mm256_add_pd(q2, y2); - q3 = _mm256_add_pd(q3, y3); - q4 = _mm256_add_pd(q4, y4); - - _mm256_store_pd(&q_dbl[0], q1); - _mm256_store_pd(&q_dbl[4], q2); - _mm256_store_pd(&q_dbl[8], q3); - _mm256_store_pd(&q_dbl[12], q4); - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]); - q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]); - q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]); - q4 = _mm256_load_pd(&q_dbl[(ldq*2)+12]); - - q1 = _mm256_add_pd(q1, x1); - q2 = _mm256_add_pd(q2, x2); - q3 = _mm256_add_pd(q3, x3); - q4 = _mm256_add_pd(q4, x4); - - tmp1 = _mm256_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, y2); -#ifdef __FMA4_ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, y3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h2_imag, y4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - q4 = _mm256_add_pd(q4, _mm256_addsub_pd( 
_mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(ldq*2)+0], q1); - _mm256_store_pd(&q_dbl[(ldq*2)+4], q2); - _mm256_store_pd(&q_dbl[(ldq*2)+8], q3); - _mm256_store_pd(&q_dbl[(ldq*2)+12], q4); - - for (i = 2; i < nb; i++) - { - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); - q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); - - h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _mm256_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, 
_mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, y3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h2_imag, y4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); - _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3); - _mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4); - } - h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]); - q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = 
_mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2); - _mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3); - _mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4); -} - -static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); - - __m256d x1, x2, x3; - __m256d y1, y2, y3; - __m256d q1, q2, q3; - __m256d h1_real, h1_imag, h2_real, h2_imag; - __m256d tmp1, tmp2, tmp3; - int i=0; - - __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); - - x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]); - x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]); - x3 = _mm256_load_pd(&q_dbl[(2*ldq)+8]); - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm256_xor_pd(h2_imag, sign); -#endif - - y1 = _mm256_load_pd(&q_dbl[0]); - y2 = _mm256_load_pd(&q_dbl[4]); - y3 = _mm256_load_pd(&q_dbl[8]); - - tmp1 = _mm256_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif 
- tmp2 = _mm256_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, x3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); - - h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm256_xor_pd(h2_imag, sign); -#endif - - tmp1 = 
_mm256_mul_pd(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, q2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, q3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - } - - h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]); - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, 
tmp3, 0x5))); -#endif - - h1_real = _mm256_broadcast_sd(&hh_dbl[0]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#else - x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#endif - - h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); - h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); - - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - h2_real = _mm256_xor_pd(h2_real, sign); - h2_imag = _mm256_xor_pd(h2_imag, sign); - - __m128d tmp_s_128 = _mm_loadu_pd(s_dbl); - tmp2 = _mm256_broadcast_pd(&tmp_s_128); - tmp1 = _mm256_mul_pd(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _mm256_FMADDSUB_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2)); - h2_real = _mm256_broadcast_sd(&s_dbl[0]); - h2_imag = _mm256_broadcast_sd(&s_dbl[1]); - - tmp1 = _mm256_mul_pd(h1_imag, y1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMADDSUB_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), 
_mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, y2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_FMADDSUB_pd(h1_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - y2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - tmp3 = _mm256_mul_pd(h1_imag, y3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm256_FMADDSUB_pd(h1_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#else - y3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#endif - - tmp1 = _mm256_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMADDSUB_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMADDSUB_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, x3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm256_add_pd(y3, _mm256_FMADDSUB_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - - q1 = _mm256_load_pd(&q_dbl[0]); - q2 = _mm256_load_pd(&q_dbl[4]); - q3 = _mm256_load_pd(&q_dbl[8]); - - q1 = _mm256_add_pd(q1, y1); - q2 = _mm256_add_pd(q2, y2); - q3 = _mm256_add_pd(q3, y3); - - _mm256_store_pd(&q_dbl[0], q1); - _mm256_store_pd(&q_dbl[4], q2); - _mm256_store_pd(&q_dbl[8], q3); - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]); - q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]); - q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]); - - q1 = _mm256_add_pd(q1, x1); - q2 = _mm256_add_pd(q2, x2); - q3 = 
_mm256_add_pd(q3, x3); - - tmp1 = _mm256_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, y2); -#ifdef __FMA4_ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, y3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(ldq*2)+0], q1); - _mm256_store_pd(&q_dbl[(ldq*2)+4], q2); - _mm256_store_pd(&q_dbl[(ldq*2)+8], q3); - - for (i = 2; i < nb; i++) - { - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); - - h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = 
_mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _mm256_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, y3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); - _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3); - } - h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 
0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2); - _mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3); -} - -static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); - - __m256d x1, x2; - __m256d y1, y2; - __m256d q1, q2; - __m256d h1_real, h1_imag, h2_real, h2_imag; - __m256d tmp1, tmp2; - int i=0; - - __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); - - x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]); - x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]); - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm256_xor_pd(h2_imag, sign); -#endif - - y1 = _mm256_load_pd(&q_dbl[0]); - y2 = _mm256_load_pd(&q_dbl[4]); - - tmp1 = _mm256_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = 
_mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - - h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm256_xor_pd(h2_imag, sign); -#endif - - tmp1 = _mm256_mul_pd(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, q2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - } - - h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, 
_mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - h1_real = _mm256_broadcast_sd(&hh_dbl[0]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - - h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); - h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); - - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - h2_real = _mm256_xor_pd(h2_real, sign); - h2_imag = _mm256_xor_pd(h2_imag, sign); - - __m128d tmp_s_128 = _mm_loadu_pd(s_dbl); - tmp2 = _mm256_broadcast_pd(&tmp_s_128); - tmp1 = _mm256_mul_pd(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _mm256_FMADDSUB_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2)); - h2_real = _mm256_broadcast_sd(&s_dbl[0]); - h2_imag = _mm256_broadcast_sd(&s_dbl[1]); - - tmp1 = _mm256_mul_pd(h1_imag, 
y1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMADDSUB_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, y2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_FMADDSUB_pd(h1_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - y2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - - tmp1 = _mm256_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMADDSUB_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMADDSUB_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - q1 = _mm256_load_pd(&q_dbl[0]); - q2 = _mm256_load_pd(&q_dbl[4]); - - q1 = _mm256_add_pd(q1, y1); - q2 = _mm256_add_pd(q2, y2); - - _mm256_store_pd(&q_dbl[0], q1); - _mm256_store_pd(&q_dbl[4], q2); - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]); - q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]); - - q1 = _mm256_add_pd(q1, x1); - q2 = _mm256_add_pd(q2, x2); - - tmp1 = _mm256_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, y2); -#ifdef __FMA4_ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), 
_mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(ldq*2)+0], q1); - _mm256_store_pd(&q_dbl[(ldq*2)+4], q2); - - for (i = 2; i < nb; i++) - { - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - - h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _mm256_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); - } - h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, 
_mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2); -} - -static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); - - __m256d x1; - __m256d y1; - __m256d q1; - __m256d h1_real, h1_imag, h2_real, h2_imag; - __m256d tmp1; - int i=0; - - __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); - - x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]); - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm256_xor_pd(h2_imag, sign); -#endif - - y1 = _mm256_load_pd(&q_dbl[0]); - - tmp1 = _mm256_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - - h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = 
_mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm256_xor_pd(h2_imag, sign); -#endif - - tmp1 = _mm256_mul_pd(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - } - - h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - - h1_real = _mm256_broadcast_sd(&hh_dbl[0]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - - h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); - h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); - - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - h2_real = _mm256_xor_pd(h2_real, sign); - h2_imag = 
_mm256_xor_pd(h2_imag, sign); - - __m128d tmp_s_128 = _mm_loadu_pd(s_dbl); - __m256d tmp2 = _mm256_broadcast_pd(&tmp_s_128); - tmp1 = _mm256_mul_pd(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _mm256_FMADDSUB_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2)); - h2_real = _mm256_broadcast_sd(&s_dbl[0]); - h2_imag = _mm256_broadcast_sd(&s_dbl[1]); - - tmp1 = _mm256_mul_pd(h1_imag, y1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMADDSUB_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - - tmp1 = _mm256_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMADDSUB_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - - q1 = _mm256_load_pd(&q_dbl[0]); - - q1 = _mm256_add_pd(q1, y1); - - _mm256_store_pd(&q_dbl[0], q1); - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]); - - q1 = _mm256_add_pd(q1, x1); - - tmp1 = _mm256_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(ldq*2)+0], q1); - - for (i = 2; i < nb; i++) - { - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - - h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, 
tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _mm256_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); - } - h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1); -} -#else -static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); - - __m128d x1, x2, x3, x4; - __m128d y1, y2, y3, y4; - __m128d q1, q2, q3, q4; - __m128d h1_real, h1_imag, h2_real, h2_imag; - __m128d tmp1, tmp2, tmp3, tmp4; - int i=0; - - __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); - - x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); - x2 = _mm_load_pd(&q_dbl[(2*ldq)+2]); - x3 = _mm_load_pd(&q_dbl[(2*ldq)+4]); - x4 = _mm_load_pd(&q_dbl[(2*ldq)+6]); - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm_xor_pd(h2_imag, sign); -#endif - - y1 = _mm_load_pd(&q_dbl[0]); - y2 = 
_mm_load_pd(&q_dbl[2]); - y3 = _mm_load_pd(&q_dbl[4]); - y4 = _mm_load_pd(&q_dbl[6]); - - tmp1 = _mm_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, x3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h2_imag, x4); -#ifdef __ELPA_USE_FMA__ - y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); - q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); - - h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, 
q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h1_imag, q4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm_xor_pd(h2_imag, sign); -#endif - - tmp1 = _mm_mul_pd(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, q2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, q3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h2_imag, q4); -#ifdef __ELPA_USE_FMA__ - y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - y4 = _mm_add_pd(y4, _mm_addsub_pd( 
_mm_mul_pd(h2_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - } - - h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); - q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]); - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h1_imag, q4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - h1_real = _mm_loaddup_pd(&hh_dbl[0]); - h1_imag = _mm_loaddup_pd(&hh_dbl[1]); - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - 
tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#else - x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#endif - tmp4 = _mm_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); -#else - x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); -#endif - - h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); - h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); - - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - h2_real = _mm_xor_pd(h2_real, sign); - h2_imag = _mm_xor_pd(h2_imag, sign); - - tmp2 = _mm_loadu_pd(s_dbl); - tmp1 = _mm_mul_pd(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - _mm_storeu_pd(s_dbl, tmp2); - h2_real = _mm_loaddup_pd(&s_dbl[0]); - h2_imag = _mm_loaddup_pd(&s_dbl[1]); - - tmp1 = _mm_mul_pd(h1_imag, y1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, y2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_maddsub_pd(h1_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - y2 = _mm_addsub_pd( _mm_mul_pd(h1_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); 
-#endif - tmp3 = _mm_mul_pd(h1_imag, y3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm_maddsub_pd(h1_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#else - y3 = _mm_addsub_pd( _mm_mul_pd(h1_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#endif - tmp4 = _mm_mul_pd(h1_imag, y4); -#ifdef __ELPA_USE_FMA__ - y4 = _mm_maddsub_pd(h1_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); -#else - y4 = _mm_addsub_pd( _mm_mul_pd(h1_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); -#endif - - tmp1 = _mm_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_maddsub_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, x3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm_add_pd(y3, _mm_maddsub_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h2_imag, x4); -#ifdef __ELPA_USE_FMA__ - y4 = _mm_add_pd(y4, _mm_maddsub_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - q1 = _mm_load_pd(&q_dbl[0]); - q2 = _mm_load_pd(&q_dbl[2]); - q3 = _mm_load_pd(&q_dbl[4]); - q4 = _mm_load_pd(&q_dbl[6]); - - q1 = _mm_add_pd(q1, y1); - q2 = _mm_add_pd(q2, y2); - q3 = _mm_add_pd(q3, y3); - q4 = _mm_add_pd(q4, y4); - - _mm_store_pd(&q_dbl[0], q1); - _mm_store_pd(&q_dbl[2], q2); - _mm_store_pd(&q_dbl[4], q3); - 
_mm_store_pd(&q_dbl[6], q4); - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); - q2 = _mm_load_pd(&q_dbl[(ldq*2)+2]); - q3 = _mm_load_pd(&q_dbl[(ldq*2)+4]); - q4 = _mm_load_pd(&q_dbl[(ldq*2)+6]); - - q1 = _mm_add_pd(q1, x1); - q2 = _mm_add_pd(q2, x2); - q3 = _mm_add_pd(q3, x3); - q4 = _mm_add_pd(q4, x4); - - tmp1 = _mm_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, y3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h2_imag, y4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm_add_pd(q4, _mm_maddsub_pd(h2_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h2_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(ldq*2)+0], q1); - _mm_store_pd(&q_dbl[(ldq*2)+2], q2); - _mm_store_pd(&q_dbl[(ldq*2)+4], q3); - _mm_store_pd(&q_dbl[(ldq*2)+6], q4); - - for (i = 2; i < nb; i++) - { - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); - q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); - - h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); - - tmp1 
= _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _mm_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, y3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, 
_mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h2_imag, y4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm_add_pd(q4, _mm_maddsub_pd(h2_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h2_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); - _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); - _mm_store_pd(&q_dbl[(2*i*ldq)+6], q4); - } - - h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); - q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*nb*ldq)+0], 
q1); - _mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2); - _mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3); - _mm_store_pd(&q_dbl[(2*nb*ldq)+6], q4); -} - -static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); - - __m128d x1, x2, x3; - __m128d y1, y2, y3; - __m128d q1, q2, q3; - __m128d h1_real, h1_imag, h2_real, h2_imag; - __m128d tmp1, tmp2, tmp3; - int i=0; - - __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); - - x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); - x2 = _mm_load_pd(&q_dbl[(2*ldq)+2]); - x3 = _mm_load_pd(&q_dbl[(2*ldq)+4]); - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm_xor_pd(h2_imag, sign); -#endif - - y1 = _mm_load_pd(&q_dbl[0]); - y2 = _mm_load_pd(&q_dbl[2]); - y3 = _mm_load_pd(&q_dbl[4]); - - tmp1 = _mm_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, x3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = 
_mm_load_pd(&q_dbl[(2*i*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); - - h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm_xor_pd(h2_imag, sign); -#endif - - tmp1 = _mm_mul_pd(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, q2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, q3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, q3, 
_mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - } - - h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - - h1_real = _mm_loaddup_pd(&hh_dbl[0]); - h1_imag = _mm_loaddup_pd(&hh_dbl[1]); - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, 
_MM_SHUFFLE2(0,1))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#else - x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#endif - - h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); - h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); - - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - h2_real = _mm_xor_pd(h2_real, sign); - h2_imag = _mm_xor_pd(h2_imag, sign); - - tmp2 = _mm_loadu_pd(s_dbl); - tmp1 = _mm_mul_pd(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - _mm_storeu_pd(s_dbl, tmp2); - h2_real = _mm_loaddup_pd(&s_dbl[0]); - h2_imag = _mm_loaddup_pd(&s_dbl[1]); - - tmp1 = _mm_mul_pd(h1_imag, y1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, y2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_maddsub_pd(h1_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - y2 = _mm_addsub_pd( _mm_mul_pd(h1_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#endif - tmp3 = _mm_mul_pd(h1_imag, y3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm_maddsub_pd(h1_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#else - y3 = _mm_addsub_pd( _mm_mul_pd(h1_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#endif - - tmp1 = _mm_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( 
_mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_maddsub_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, x3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm_add_pd(y3, _mm_maddsub_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - - q1 = _mm_load_pd(&q_dbl[0]); - q2 = _mm_load_pd(&q_dbl[2]); - q3 = _mm_load_pd(&q_dbl[4]); - - q1 = _mm_add_pd(q1, y1); - q2 = _mm_add_pd(q2, y2); - q3 = _mm_add_pd(q3, y3); - - _mm_store_pd(&q_dbl[0], q1); - _mm_store_pd(&q_dbl[2], q2); - _mm_store_pd(&q_dbl[4], q3); - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); - q2 = _mm_load_pd(&q_dbl[(ldq*2)+2]); - q3 = _mm_load_pd(&q_dbl[(ldq*2)+4]); - - q1 = _mm_add_pd(q1, x1); - q2 = _mm_add_pd(q2, x2); - q3 = _mm_add_pd(q3, x3); - - tmp1 = _mm_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, y3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( 
_mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(ldq*2)+0], q1); - _mm_store_pd(&q_dbl[(ldq*2)+2], q2); - _mm_store_pd(&q_dbl[(ldq*2)+4], q3); - - for (i = 2; i < nb; i++) - { - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); - - h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _mm_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = 
_mm_mul_pd(h2_imag, y3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); - _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); - } - - h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2); - _mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3); -} - -static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); - - __m128d x1, x2; - __m128d y1, y2; - __m128d q1, q2; - __m128d h1_real, h1_imag, h2_real, h2_imag; - __m128d tmp1, 
tmp2; - int i=0; - - __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); - - x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); - x2 = _mm_load_pd(&q_dbl[(2*ldq)+2]); - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm_xor_pd(h2_imag, sign); -#endif - - y1 = _mm_load_pd(&q_dbl[0]); - y2 = _mm_load_pd(&q_dbl[2]); - - tmp1 = _mm_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - - h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); -#ifndef 
__ELPA_USE_FMA__ - // conjugate - h2_imag = _mm_xor_pd(h2_imag, sign); -#endif - - tmp1 = _mm_mul_pd(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, q2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - } - - h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - h1_real = _mm_loaddup_pd(&hh_dbl[0]); - h1_imag = _mm_loaddup_pd(&hh_dbl[1]); - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = 
_mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#endif - - h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); - h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); - - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - h2_real = _mm_xor_pd(h2_real, sign); - h2_imag = _mm_xor_pd(h2_imag, sign); - - tmp2 = _mm_loadu_pd(s_dbl); - tmp1 = _mm_mul_pd(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - _mm_storeu_pd(s_dbl, tmp2); - h2_real = _mm_loaddup_pd(&s_dbl[0]); - h2_imag = _mm_loaddup_pd(&s_dbl[1]); - - tmp1 = _mm_mul_pd(h1_imag, y1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, y2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_maddsub_pd(h1_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - y2 = _mm_addsub_pd( _mm_mul_pd(h1_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#endif - - tmp1 = _mm_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_maddsub_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); 
-#endif - - q1 = _mm_load_pd(&q_dbl[0]); - q2 = _mm_load_pd(&q_dbl[2]); - - q1 = _mm_add_pd(q1, y1); - q2 = _mm_add_pd(q2, y2); - - _mm_store_pd(&q_dbl[0], q1); - _mm_store_pd(&q_dbl[2], q2); - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); - q2 = _mm_load_pd(&q_dbl[(ldq*2)+2]); - - q1 = _mm_add_pd(q1, x1); - q2 = _mm_add_pd(q2, x2); - - tmp1 = _mm_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(ldq*2)+0], q1); - _mm_store_pd(&q_dbl[(ldq*2)+2], q2); - - for (i = 2; i < nb; i++) - { - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - - h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _mm_mul_pd(h2_imag, 
y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); - } - - h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2); -} - -static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); - - __m128d x1; - __m128d y1; - __m128d q1; - __m128d h1_real, h1_imag, h2_real, h2_imag; - __m128d tmp1; - int i=0; - - __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); - - x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); - - h2_real = 
_mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm_xor_pd(h2_imag, sign); -#endif - - y1 = _mm_load_pd(&q_dbl[0]); - - tmp1 = _mm_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - - h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm_xor_pd(h2_imag, sign); -#endif - - tmp1 = _mm_mul_pd(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - } - - h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), 
_mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - - h1_real = _mm_loaddup_pd(&hh_dbl[0]); - h1_imag = _mm_loaddup_pd(&hh_dbl[1]); - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - - h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); - h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); - - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - h2_real = _mm_xor_pd(h2_real, sign); - h2_imag = _mm_xor_pd(h2_imag, sign); - - __m128d tmp2 = _mm_loadu_pd(s_dbl); - tmp1 = _mm_mul_pd(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - _mm_storeu_pd(s_dbl, tmp2); - h2_real = _mm_loaddup_pd(&s_dbl[0]); - h2_imag = _mm_loaddup_pd(&s_dbl[1]); - - tmp1 = _mm_mul_pd(h1_imag, y1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - - tmp1 = _mm_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - - q1 = _mm_load_pd(&q_dbl[0]); - - q1 = _mm_add_pd(q1, y1); - - _mm_store_pd(&q_dbl[0], q1); - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); - - q1 = 
_mm_load_pd(&q_dbl[(ldq*2)+0]); - - q1 = _mm_add_pd(q1, x1); - - tmp1 = _mm_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(ldq*2)+0], q1); - - for (i = 2; i < nb; i++) - { - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - - h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _mm_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); - } - - h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); -} -#endif -} // extern C diff --git a/src/elpa2_kernels/elpa2_kernels_real.F90 b/src/elpa2_kernels/elpa2_kernels_real.F90 deleted file mode 100644 index 66cf86067..000000000 --- a/src/elpa2_kernels/elpa2_kernels_real.F90 +++ /dev/null @@ 
-1,662 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! -------------------------------------------------------------------------------------------------- -! -! This file contains the compute intensive kernels for the Householder transformations. -! 
It should be compiled with the highest possible optimization level. -! -! On Intel use -O3 -xSSE4.2 (or the SSE level fitting to your CPU) -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -! -------------------------------------------------------------------------------------------------- -#include "config-f90.h" - -#ifdef DESPERATELY_WANT_ASSUMED_SIZE -#define PACK_REAL_TO_COMPLEX -#else -#undef PACK_REAL_TO_COMPLEX -#endif - -#ifndef DESPERATELY_WANT_ASSUMED_SIZE -module real_generic_kernel - - private - public double_hh_trafo_generic -contains -#endif - - - subroutine double_hh_trafo_generic(q, hh, nb, nq, ldq, ldh) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use iso_c_binding - implicit none - - integer(kind=ik), intent(in) :: nb, nq, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk), intent(inout) :: q(ldq,*) - real(kind=rk), intent(in) :: hh(ldh,*) -#else - real(kind=rk), intent(inout) :: q(1:ldq,1:nb+1) - real(kind=rk), intent(in) :: hh(1:ldh,1:6) -#endif - - real(kind=rk) :: s - integer(kind=ik) :: i - -! equivalence(q(1,1),q_complex(1,1)) - - ! Safety only: -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: double_hh_trafo_generic") -#endif - if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!' - - ! Calculate dot product of the two Householder vectors - - s = hh(2,2)*1 - do i=3,nb - s = s+hh(i,2)*hh(i-1,1) - enddo - - ! Do the Householder transformations - -#ifndef DESPERATELY_WANT_ASSUMED_SIZE -! ! assign real data to compplex pointer -! call c_f_pointer(c_loc(q), q_complex, [size(q,dim=1)/2,size(q,dim=2)]) -#endif - ! 
Always a multiple of 4 Q-rows is transformed, even if nq is smaller - - do i=1,nq-8,12 -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call hh_trafo_kernel_12_generic(q(i,1),hh, nb, ldq, ldh, s) -#else - call hh_trafo_kernel_12_generic(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) -#endif - enddo - - ! i > nq-8 now, i.e. at most 8 rows remain - - if(nq-i+1 > 4) then -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - print *,"calling 8" - call hh_trafo_kernel_8_generic(q(i,1),hh, nb, ldq, ldh, s) -#else - call hh_trafo_kernel_8_generic(q(i:ldq,1:nb+1), hh(1:ldh,1:2), nb, ldq, ldh, s) -#endif - - else if(nq-i+1 > 0) then -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - print *,"calling 4" - call hh_trafo_kernel_4_generic(q(i,1),hh, nb, ldq, ldh, s) -#else - call hh_trafo_kernel_4_generic(q(i:ldq,1:+nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) -#endif - - endif -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: double_hh_trafo_generic") -#endif - end subroutine double_hh_trafo_generic - - ! -------------------------------------------------------------------------------------------------- - ! The following kernels perform the Householder transformation on Q for 12/8/4 rows. - ! Please note that Q is declared complex*16 here. - ! This is a hint for compilers that packed arithmetic can be used for Q - ! (relevant for Intel SSE and BlueGene double hummer CPUs). - ! 
-------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_12_generic(q, hh, nb, ldq, ldh, s) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq/2,*) - real(kind=rk), intent(in) :: hh(ldh,*) -#else - real(kind=rk), intent(inout) :: q(:,:) - real(kind=rk), intent(in) :: hh(ldh,2) -#endif - real(kind=rk), intent(in) :: s - -#ifdef PACK_REAL_TO_COMPLEX - complex(kind=ck) :: x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6 -#else - real(kind=rk) :: x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, & - y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12 -#endif - real(kind=rk) :: h1, h2, tau1, tau2 - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_kernel_12_generic") -#endif - x1 = q(1,2) - x2 = q(2,2) - x3 = q(3,2) - x4 = q(4,2) - x5 = q(5,2) - x6 = q(6,2) -#ifndef PACK_REAL_TO_COMPLEX - x7 = q(7,2) - x8 = q(8,2) - x9 = q(9,2) - x10 = q(10,2) - x11 = q(11,2) - x12 = q(12,2) -#endif - - y1 = q(1 ,1) + q(1, 2)*hh(2,2) - y2 = q(2 ,1) + q(2, 2)*hh(2,2) - y3 = q(3 ,1) + q(3, 2)*hh(2,2) - y4 = q(4 ,1) + q(4, 2)*hh(2,2) - y5 = q(5 ,1) + q(5, 2)*hh(2,2) - y6 = q(6 ,1) + q(6, 2)*hh(2,2) -#ifndef PACK_REAL_TO_COMPLEX - y7 = q(7 ,1) + q(7, 2)*hh(2,2) - y8 = q(8 ,1) + q(8, 2)*hh(2,2) - y9 = q(9 ,1) + q(9, 2)*hh(2,2) - y10 = q(10,1) + q(10,2)*hh(2,2) - y11 = q(11,1) + q(11,2)*hh(2,2) - y12 = q(12,1) + q(12,2)*hh(2,2) -#endif - - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - x1 = x1 + q(1, i)*h1 - y1 = y1 + q(1, i)*h2 - x2 = x2 + q(2, i)*h1 - y2 = y2 + q(2, i)*h2 - x3 = x3 + q(3, i)*h1 - y3 = y3 + q(3, i)*h2 - x4 = x4 + q(4, i)*h1 - y4 = y4 + q(4, i)*h2 - x5 = x5 + q(5, i)*h1 - y5 = y5 + q(5, i)*h2 - x6 = x6 + q(6, i)*h1 - y6 = y6 + q(6, i)*h2 -#ifndef PACK_REAL_TO_COMPLEX - x7 = x7 
+ q(7, i)*h1 - y7 = y7 + q(7, i)*h2 - x8 = x8 + q(8, i)*h1 - y8 = y8 + q(8, i)*h2 - x9 = x9 + q(9, i)*h1 - y9 = y9 + q(9, i)*h2 - x10 = x10 + q(10,i)*h1 - y10 = y10 + q(10,i)*h2 - x11 = x11 + q(11,i)*h1 - y11 = y11 + q(11,i)*h2 - x12 = x12 + q(12,i)*h1 - y12 = y12 + q(12,i)*h2 -#endif - enddo - - x1 = x1 + q(1,nb+1)*hh(nb,1) - x2 = x2 + q(2,nb+1)*hh(nb,1) - x3 = x3 + q(3,nb+1)*hh(nb,1) - x4 = x4 + q(4,nb+1)*hh(nb,1) - x5 = x5 + q(5,nb+1)*hh(nb,1) - x6 = x6 + q(6,nb+1)*hh(nb,1) -#ifndef PACK_REAL_TO_COMPLEX - x7 = x7 + q(7, nb+1)*hh(nb,1) - x8 = x8 + q(8, nb+1)*hh(nb,1) - x9 = x9 + q(9, nb+1)*hh(nb,1) - x10 = x10 + q(10,nb+1)*hh(nb,1) - x11 = x11 + q(11,nb+1)*hh(nb,1) - x12 = x12 + q(12,nb+1)*hh(nb,1) - -#endif - - tau1 = hh(1,1) - tau2 = hh(1,2) - - h1 = -tau1 - x1 = x1 *h1 - x2 = x2 *h1 - x3 = x3 *h1 - x4 = x4 *h1 - x5 = x5 *h1 - x6 = x6 *h1 -#ifndef PACK_REAL_TO_COMPLEX - x7 = x7 *h1 - x8 = x8 *h1 - x9 = x9 *h1 - x10 = x10*h1 - x11 = x11*h1 - x12 = x12*h1 -#endif - - h1 = -tau2 - h2 = -tau2*s - y1 = y1 *h1 + x1 *h2 - y2 = y2 *h1 + x2 *h2 - y3 = y3 *h1 + x3 *h2 - y4 = y4 *h1 + x4 *h2 - y5 = y5 *h1 + x5 *h2 - y6 = y6 *h1 + x6 *h2 -#ifndef PACK_REAL_TO_COMPLEX - y7 = y7 *h1 + x7 *h2 - y8 = y8 *h1 + x8 *h2 - y9 = y9 *h1 + x9 *h2 - y10 = y10*h1 + x10*h2 - y11 = y11*h1 + x11*h2 - y12 = y12*h1 + x12*h2 -#endif - q(1,1) = q(1, 1) + y1 - q(2,1) = q(2, 1) + y2 - q(3,1) = q(3, 1) + y3 - q(4,1) = q(4, 1) + y4 - q(5,1) = q(5, 1) + y5 - q(6,1) = q(6, 1) + y6 -#ifndef PACK_REAL_TO_COMPLEX - q(7 ,1) = q(7, 1) + y7 - q(8 ,1) = q(8, 1) + y8 - q(9 ,1) = q(9, 1) + y9 - q(10,1) = q(10,1) + y10 - q(11,1) = q(11,1) + y11 - q(12,1) = q(12,1) + y12 -#endif - - q(1, 2) = q(1, 2) + x1 + y1 *hh(2,2) - q(2, 2) = q(2, 2) + x2 + y2 *hh(2,2) - q(3, 2) = q(3, 2) + x3 + y3 *hh(2,2) - q(4, 2) = q(4, 2) + x4 + y4 *hh(2,2) - q(5, 2) = q(5, 2) + x5 + y5 *hh(2,2) - q(6, 2) = q(6, 2) + x6 + y6 *hh(2,2) -#ifndef PACK_REAL_TO_COMPLEX - q(7, 2) = q(7, 2) + x7 + y7 *hh(2,2) - q(8, 2) = q(8, 2) + x8 + y8 
*hh(2,2) - q(9, 2) = q(9, 2) + x9 + y9 *hh(2,2) - q(10,2) = q(10,2) + x10 + y10*hh(2,2) - q(11,2) = q(11,2) + x11 + y11*hh(2,2) - q(12,2) = q(12,2) + x12 + y12*hh(2,2) -#endif - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - q(1, i) = q(1,i) + x1 *h1 + y1 *h2 - q(2, i) = q(2,i) + x2 *h1 + y2 *h2 - q(3, i) = q(3,i) + x3 *h1 + y3 *h2 - q(4, i) = q(4,i) + x4 *h1 + y4 *h2 - q(5, i) = q(5,i) + x5 *h1 + y5 *h2 - q(6, i) = q(6,i) + x6 *h1 + y6 *h2 -#ifndef PACK_REAL_TO_COMPLEX - q(7, i) = q(7, i) + x7 *h1 + y7 *h2 - q(8, i) = q(8, i) + x8 *h1 + y8 *h2 - q(9, i) = q(9, i) + x9 *h1 + y9 *h2 - q(10,i) = q(10,i) + x10*h1 + y10*h2 - q(11,i) = q(11,i) + x11*h1 + y11*h2 - q(12,i) = q(12,i) + x12*h1 + y12*h2 -#endif - enddo - - q(1, nb+1) = q(1, nb+1) + x1 *hh(nb,1) - q(2, nb+1) = q(2, nb+1) + x2 *hh(nb,1) - q(3, nb+1) = q(3, nb+1) + x3 *hh(nb,1) - q(4, nb+1) = q(4, nb+1) + x4 *hh(nb,1) - q(5, nb+1) = q(5, nb+1) + x5 *hh(nb,1) - q(6, nb+1) = q(6, nb+1) + x6 *hh(nb,1) -#ifndef PACK_REAL_TO_COMPLEX - q(7, nb+1) = q(7, nb+1) + x7 *hh(nb,1) - q(8, nb+1) = q(8, nb+1) + x8 *hh(nb,1) - q(9, nb+1) = q(9, nb+1) + x9 *hh(nb,1) - q(10,nb+1) = q(10,nb+1) + x10*hh(nb,1) - q(11,nb+1) = q(11,nb+1) + x11*hh(nb,1) - q(12,nb+1) = q(12,nb+1) + x12*hh(nb,1) -#endif - - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_kernel_12_generic") -#endif - end subroutine hh_trafo_kernel_12_generic - - ! 
-------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_8_generic(q, hh, nb, ldq, ldh, s) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq/2,*) - real(kind=rk), intent(in) :: hh(ldh,*) -#else - real(kind=rk), intent(inout) :: q(:,:) - real(kind=rk), intent(in) :: hh(ldh,2) -#endif - real(kind=rk), intent(in) :: s -#ifdef PACK_REAL_TO_COMPLEX - complex(kind=ck) :: x1, x2, x3, x4, y1, y2, y3, y4 -#else - real(kind=rk) :: x1, x2, x3, x4, x5, x6, x7, x8, & - y1, y2, y3, y4, y5, y6, y7, y8 -#endif - real(kind=rk) :: h1, h2, tau1, tau2 - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_kernel_8_generic") -#endif - x1 = q(1,2) - x2 = q(2,2) - x3 = q(3,2) - x4 = q(4,2) -#ifndef PACK_REAL_TO_COMPLEX - x5 = q(5,2) - x6 = q(6,2) - x7 = q(7,2) - x8 = q(8,2) -#endif - - y1 = q(1,1) + q(1,2)*hh(2,2) - y2 = q(2,1) + q(2,2)*hh(2,2) - y3 = q(3,1) + q(3,2)*hh(2,2) - y4 = q(4,1) + q(4,2)*hh(2,2) -#ifndef PACK_REAL_TO_COMPLEX - y5 = q(5,1) + q(5,2)*hh(2,2) - y6 = q(6,1) + q(6,2)*hh(2,2) - y7 = q(7,1) + q(7,2)*hh(2,2) - y8 = q(8,1) + q(8,2)*hh(2,2) -#endif - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - x1 = x1 + q(1,i)*h1 - y1 = y1 + q(1,i)*h2 - x2 = x2 + q(2,i)*h1 - y2 = y2 + q(2,i)*h2 - x3 = x3 + q(3,i)*h1 - y3 = y3 + q(3,i)*h2 - x4 = x4 + q(4,i)*h1 - y4 = y4 + q(4,i)*h2 -#ifndef PACK_REAL_TO_COMPLEX - x5 = x5 + q(5,i)*h1 - y5 = y5 + q(5,i)*h2 - x6 = x6 + q(6,i)*h1 - y6 = y6 + q(6,i)*h2 - x7 = x7 + q(7,i)*h1 - y7 = y7 + q(7,i)*h2 - x8 = x8 + q(8,i)*h1 - y8 = y8 + q(8,i)*h2 -#endif - enddo - - x1 = x1 + q(1,nb+1)*hh(nb,1) - x2 = x2 + q(2,nb+1)*hh(nb,1) - x3 = x3 + q(3,nb+1)*hh(nb,1) - x4 = x4 + q(4,nb+1)*hh(nb,1) -#ifndef PACK_REAL_TO_COMPLEX - x5 = x5 + q(5,nb+1)*hh(nb,1) - x6 
= x6 + q(6,nb+1)*hh(nb,1) - x7 = x7 + q(7,nb+1)*hh(nb,1) - x8 = x8 + q(8,nb+1)*hh(nb,1) -#endif - - tau1 = hh(1,1) - tau2 = hh(1,2) - - h1 = -tau1 - x1 = x1*h1 - x2 = x2*h1 - x3 = x3*h1 - x4 = x4*h1 -#ifndef PACK_REAL_TO_COMPLEX - x5 = x5*h1 - x6 = x6*h1 - x7 = x7*h1 - x8 = x8*h1 -#endif - h1 = -tau2 - h2 = -tau2*s - y1 = y1*h1 + x1*h2 - y2 = y2*h1 + x2*h2 - y3 = y3*h1 + x3*h2 - y4 = y4*h1 + x4*h2 -#ifndef PACK_REAL_TO_COMPLEX - y5 = y5*h1 + x5*h2 - y6 = y6*h1 + x6*h2 - y7 = y7*h1 + x7*h2 - y8 = y8*h1 + x8*h2 -#endif - q(1,1) = q(1,1) + y1 - q(2,1) = q(2,1) + y2 - q(3,1) = q(3,1) + y3 - q(4,1) = q(4,1) + y4 -#ifndef PACK_REAL_TO_COMPLEX - q(5,1) = q(5,1) + y5 - q(6,1) = q(6,1) + y6 - q(7,1) = q(7,1) + y7 - q(8,1) = q(8,1) + y8 -#endif - q(1,2) = q(1,2) + x1 + y1*hh(2,2) - q(2,2) = q(2,2) + x2 + y2*hh(2,2) - q(3,2) = q(3,2) + x3 + y3*hh(2,2) - q(4,2) = q(4,2) + x4 + y4*hh(2,2) -#ifndef PACK_REAL_TO_COMPLEX - q(5,2) = q(5,2) + x5 + y5*hh(2,2) - q(6,2) = q(6,2) + x6 + y6*hh(2,2) - q(7,2) = q(7,2) + x7 + y7*hh(2,2) - q(8,2) = q(8,2) + x8 + y8*hh(2,2) -#endif - - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - q(1,i) = q(1,i) + x1*h1 + y1*h2 - q(2,i) = q(2,i) + x2*h1 + y2*h2 - q(3,i) = q(3,i) + x3*h1 + y3*h2 - q(4,i) = q(4,i) + x4*h1 + y4*h2 -#ifndef PACK_REAL_TO_COMPLEX - q(5,i) = q(5,i) + x5*h1 + y5*h2 - q(6,i) = q(6,i) + x6*h1 + y6*h2 - q(7,i) = q(7,i) + x7*h1 + y7*h2 - q(8,i) = q(8,i) + x8*h1 + y8*h2 -#endif - enddo - - q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) - q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) - q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1) - q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1) -#ifndef PACK_REAL_TO_COMPLEX - q(5,nb+1) = q(5,nb+1) + x5*hh(nb,1) - q(6,nb+1) = q(6,nb+1) + x6*hh(nb,1) - q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1) - q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1) -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_kernel_8_generic") -#endif - - end subroutine hh_trafo_kernel_8_generic - - ! 
-------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_4_generic(q, hh, nb, ldq, ldh, s) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq/2,*) - real(kind=rk), intent(in) :: hh(ldh,*) -#else - real(kind=rk), intent(inout) :: q(:,:) !q(1:ldq/2,1:nb+1) - real(kind=rk), intent(in) :: hh(ldh,2) -#endif - real(kind=rk), intent(in) :: s - -#ifdef PACK_REAL_TO_COMPLEX - complex(kind=ck) :: x1, x2, y1, y2 -#else - real(kind=rk) :: x1, x2, x3, x4, y1, y2, y3, y4 -#endif - real(kind=rk) :: h1, h2, tau1, tau2 - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_kernel_4_generic") -#endif - x1 = q(1,2) - x2 = q(2,2) -#ifndef PACK_REAL_TO_COMPLEX - x3 = q(3,2) - x4 = q(4,2) -#endif - - y1 = q(1,1) + q(1,2)*hh(2,2) - y2 = q(2,1) + q(2,2)*hh(2,2) -#ifndef PACK_REAL_TO_COMPLEX - y3 = q(3,1) + q(3,2)*hh(2,2) - y4 = q(4,1) + q(4,2)*hh(2,2) -#endif - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - x1 = x1 + q(1,i)*h1 - y1 = y1 + q(1,i)*h2 - x2 = x2 + q(2,i)*h1 - y2 = y2 + q(2,i)*h2 -#ifndef PACK_REAL_TO_COMPLEX - x3 = x3 + q(3,i)*h1 - y3 = y3 + q(3,i)*h2 - x4 = x4 + q(4,i)*h1 - y4 = y4 + q(4,i)*h2 -#endif - enddo - - x1 = x1 + q(1,nb+1)*hh(nb,1) - x2 = x2 + q(2,nb+1)*hh(nb,1) -#ifndef PACK_REAL_TO_COMPLEX - x3 = x3 + q(3,nb+1)*hh(nb,1) - x4 = x4 + q(4,nb+1)*hh(nb,1) -#endif - - tau1 = hh(1,1) - tau2 = hh(1,2) - - h1 = -tau1 - x1 = x1*h1 - x2 = x2*h1 -#ifndef PACK_REAL_TO_COMPLEX - x3 = x3*h1 - x4 = x4*h1 -#endif - h1 = -tau2 - h2 = -tau2*s - y1 = y1*h1 + x1*h2 - y2 = y2*h1 + x2*h2 -#ifndef PACK_REAL_TO_COMPLEX - y3 = y3*h1 + x3*h2 - y4 = y4*h1 + x4*h2 -#endif - - q(1,1) = q(1,1) + y1 - q(2,1) = q(2,1) + y2 -#ifndef PACK_REAL_TO_COMPLEX - q(3,1) = q(3,1) + y3 - q(4,1) = q(4,1) + y4 
-#endif - q(1,2) = q(1,2) + x1 + y1*hh(2,2) - q(2,2) = q(2,2) + x2 + y2*hh(2,2) -#ifndef PACK_REAL_TO_COMPLEX - q(3,2) = q(3,2) + x3 + y3*hh(2,2) - q(4,2) = q(4,2) + x4 + y4*hh(2,2) -#endif - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - q(1,i) = q(1,i) + x1*h1 + y1*h2 - q(2,i) = q(2,i) + x2*h1 + y2*h2 -#ifndef PACK_REAL_TO_COMPLEX - q(3,i) = q(3,i) + x3*h1 + y3*h2 - q(4,i) = q(4,i) + x4*h1 + y4*h2 -#endif - enddo - - q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) - q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) -#ifndef PACK_REAL_TO_COMPLEX - q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1) - q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1) -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_kernel_4_generic") -#endif - - end subroutine hh_trafo_kernel_4_generic -#ifndef DESPERATELY_WANT_ASSUMED_SIZE -end module real_generic_kernel -#endif -! -------------------------------------------------------------------------------------------------- diff --git a/src/elpa2_kernels/elpa2_kernels_real_bgp.f90 b/src/elpa2_kernels/elpa2_kernels_real_bgp.f90 deleted file mode 100644 index f73c263d2..000000000 --- a/src/elpa2_kernels/elpa2_kernels_real_bgp.f90 +++ /dev/null @@ -1,799 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! 
ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! -------------------------------------------------------------------------------------------------- -! -! This file contains the compute intensive kernels for the Householder transformations. -! -! *** Special IBM BlueGene/P version with BlueGene assembler instructions in Fortran *** -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -! -------------------------------------------------------------------------------------------------- -!module real_bgp_kernel - -! private -! 
public double_hh_trafo_bgp -!contains - subroutine double_hh_trafo_bgp(q, hh, nb, nq, ldq, ldh) - use precision - - implicit none - - integer(kind=ik), intent(in) :: nb, nq, ldq, ldh - real(kind=rk), intent(inout) :: q(ldq,*) - real(kind=rk), intent(in) :: hh(ldh,*) - - real(kind=rk) :: s - integer(kind=ik) :: i - - ! Safety only: - - if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!' - if(mod(loc(q),16) /= 0) STOP 'Q unaligned!' - - ! Calculate dot product of the two Householder vectors - - s = hh(2,2)*1 - do i=3,nb - s = s+hh(i,2)*hh(i-1,1) - enddo - - do i=1,nq-16,20 - call hh_trafo_kernel_10_bgp(q(i ,1), hh, nb, ldq, ldh, s) - call hh_trafo_kernel_10_bgp(q(i+10,1), hh, nb, ldq, ldh, s) - enddo - - ! i > nq-16 now, i.e. at most 16 rows remain - - if(nq-i+1 > 12) then - call hh_trafo_kernel_8_bgp(q(i ,1), hh, nb, ldq, ldh, s) - call hh_trafo_kernel_8_bgp(q(i+8,1), hh, nb, ldq, ldh, s) - else if(nq-i+1 > 8) then - call hh_trafo_kernel_8_bgp(q(i ,1), hh, nb, ldq, ldh, s) - call hh_trafo_kernel_4_bgp(q(i+8,1), hh, nb, ldq, ldh, s) - else if(nq-i+1 > 4) then - call hh_trafo_kernel_8_bgp(q(i ,1), hh, nb, ldq, ldh, s) - else if(nq-i+1 > 0) then - call hh_trafo_kernel_4_bgp(q(i ,1), hh, nb, ldq, ldh, s) - endif - - end subroutine double_hh_trafo_bgp - - ! -------------------------------------------------------------------------------------------------- - ! The following kernels perform the Householder transformation on Q for 10/8/4 rows. - ! Please note that Q is declared complex*16 here. - ! 
-------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_10_bgp(q, hh, nb, ldq, ldh, s) - - use precision - implicit none - - include 'mpif.h' - - integer(kind=ik), intent(in) :: nb, ldq, ldh - complex(kind=ck), intent(inout) :: q(ldq/2,*) - real(kind=rk), intent(in) :: hh(ldh,*), s - - complex(kind=ck) :: x1, x2, x3, x4, x5, y1, y2, y3, y4, y5, q1, q2, q3, q4, q5, p1, p2, p3, p4, p5 - real(kind=rk) :: h1, h2 - integer(kind=ik) :: i - - ! complex*16 loadfp, fxcpmadd, fxpmul, fpadd, a, b - ! real*8 x - ! loadfp(a) = a - ! fxcpmadd(a,b,x) = a + b*x - ! fxpmul(a,x) = a*x - ! fpadd(a,b) = a+b - ! - call alignx(16,q) - - - x1 = loadfp(q(1,2)) - x2 = loadfp(q(2,2)) - x3 = loadfp(q(3,2)) - x4 = loadfp(q(4,2)) - x5 = loadfp(q(5,2)) - - h2 = hh(2,2) - y1 = loadfp(q(1,1)) - y2 = loadfp(q(2,1)) - y3 = loadfp(q(3,1)) - y4 = loadfp(q(4,1)) - y5 = loadfp(q(5,1)) - y1 = fxcpmadd(y1,x1,h2) - q1 = loadfp(q(1,3)) - y2 = fxcpmadd(y2,x2,h2) - q2 = loadfp(q(2,3)) - y3 = fxcpmadd(y3,x3,h2) - q3 = loadfp(q(3,3)) - y4 = fxcpmadd(y4,x4,h2) - q4 = loadfp(q(4,3)) - y5 = fxcpmadd(y5,x5,h2) - q5 = loadfp(q(5,3)) - - h1 = hh(3-1,1) - - do i=3,nb,2 - - h2 = hh(i,2) - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - x3 = fxcpmadd(x3,q3,h1) - x4 = fxcpmadd(x4,q4,h1) - x5 = fxcpmadd(x5,q5,h1) - - h1 = hh(i ,1) - - y1 = fxcpmadd(y1,q1,h2) - q1 = loadfp(q(1,i+1)) - y2 = fxcpmadd(y2,q2,h2) - q2 = loadfp(q(2,i+1)) - y3 = fxcpmadd(y3,q3,h2) - q3 = loadfp(q(3,i+1)) - y4 = fxcpmadd(y4,q4,h2) - q4 = loadfp(q(4,i+1)) - y5 = fxcpmadd(y5,q5,h2) - q5 = loadfp(q(5,i+1)) - - if(i==nb) exit - - h2 = hh(i+1,2) - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - x3 = fxcpmadd(x3,q3,h1) - x4 = fxcpmadd(x4,q4,h1) - x5 = fxcpmadd(x5,q5,h1) - - h1 = hh(i+1,1) - - y1 = fxcpmadd(y1,q1,h2) - q1 = loadfp(q(1,i+2)) - y2 = fxcpmadd(y2,q2,h2) - q2 = loadfp(q(2,i+2)) - y3 = fxcpmadd(y3,q3,h2) - q3 = loadfp(q(3,i+2)) - y4 = fxcpmadd(y4,q4,h2) - q4 
= loadfp(q(4,i+2)) - y5 = fxcpmadd(y5,q5,h2) - q5 = loadfp(q(5,i+2)) - - enddo - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - x3 = fxcpmadd(x3,q3,h1) - x4 = fxcpmadd(x4,q4,h1) - x5 = fxcpmadd(x5,q5,h1) - - h1 = -hh(1,1) ! for below - h2 = -hh(1,2) - x1 = fxpmul(x1,h1) - x2 = fxpmul(x2,h1) - x3 = fxpmul(x3,h1) - x4 = fxpmul(x4,h1) - x5 = fxpmul(x5,h1) - h1 = -hh(1,2)*s - y1 = fxpmul(y1,h2) - y2 = fxpmul(y2,h2) - y3 = fxpmul(y3,h2) - y4 = fxpmul(y4,h2) - y5 = fxpmul(y5,h2) - y1 = fxcpmadd(y1,x1,h1) - q1 = loadfp(q(1,1)) - y2 = fxcpmadd(y2,x2,h1) - q2 = loadfp(q(2,1)) - y3 = fxcpmadd(y3,x3,h1) - q3 = loadfp(q(3,1)) - y4 = fxcpmadd(y4,x4,h1) - q4 = loadfp(q(4,1)) - y5 = fxcpmadd(y5,x5,h1) - q5 = loadfp(q(5,1)) - - q1 = fpadd(q1,y1) - p1 = loadfp(q(1,2)) - q2 = fpadd(q2,y2) - p2 = loadfp(q(2,2)) - q3 = fpadd(q3,y3) - p3 = loadfp(q(3,2)) - q4 = fpadd(q4,y4) - p4 = loadfp(q(4,2)) - q5 = fpadd(q5,y5) - p5 = loadfp(q(5,2)) - - h2 = hh(2,2) - - call storefp(q(1,1),q1) - p1 = fpadd(p1,x1) - call storefp(q(2,1),q2) - p2 = fpadd(p2,x2) - call storefp(q(3,1),q3) - p3 = fpadd(p3,x3) - call storefp(q(4,1),q4) - p4 = fpadd(p4,x4) - call storefp(q(5,1),q5) - p5 = fpadd(p5,x5) - - p1 = fxcpmadd(p1,y1,h2) - q1 = loadfp(q(1,3)) - p2 = fxcpmadd(p2,y2,h2) - q2 = loadfp(q(2,3)) - p3 = fxcpmadd(p3,y3,h2) - q3 = loadfp(q(3,3)) - p4 = fxcpmadd(p4,y4,h2) - q4 = loadfp(q(4,3)) - p5 = fxcpmadd(p5,y5,h2) - q5 = loadfp(q(5,3)) - - h1 = hh(3-1,1) - - do i=3,nb,2 - - h2 = hh(i,2) - - call storefp(q(1,i-1),p1) - q1 = fxcpmadd(q1,x1,h1) - call storefp(q(2,i-1),p2) - q2 = fxcpmadd(q2,x2,h1) - call storefp(q(3,i-1),p3) - q3 = fxcpmadd(q3,x3,h1) - call storefp(q(4,i-1),p4) - q4 = fxcpmadd(q4,x4,h1) - call storefp(q(5,i-1),p5) - q5 = fxcpmadd(q5,x5,h1) - - h1 = hh(i,1) - - q1 = fxcpmadd(q1,y1,h2) - p1 = loadfp(q(1,i+1)) - q2 = fxcpmadd(q2,y2,h2) - p2 = loadfp(q(2,i+1)) - q3 = fxcpmadd(q3,y3,h2) - p3 = loadfp(q(3,i+1)) - q4 = fxcpmadd(q4,y4,h2) - p4 = loadfp(q(4,i+1)) - q5 = fxcpmadd(q5,y5,h2) - 
p5 = loadfp(q(5,i+1)) - - if(i==nb) exit - - h2 = hh(i+1,2) - - call storefp(q(1,i),q1) - p1 = fxcpmadd(p1,x1,h1) - call storefp(q(2,i),q2) - p2 = fxcpmadd(p2,x2,h1) - call storefp(q(3,i),q3) - p3 = fxcpmadd(p3,x3,h1) - call storefp(q(4,i),q4) - p4 = fxcpmadd(p4,x4,h1) - call storefp(q(5,i),q5) - p5 = fxcpmadd(p5,x5,h1) - - h1 = hh(i+1,1) - - p1 = fxcpmadd(p1,y1,h2) - q1 = loadfp(q(1,i+2)) - p2 = fxcpmadd(p2,y2,h2) - q2 = loadfp(q(2,i+2)) - p3 = fxcpmadd(p3,y3,h2) - q3 = loadfp(q(3,i+2)) - p4 = fxcpmadd(p4,y4,h2) - q4 = loadfp(q(4,i+2)) - p5 = fxcpmadd(p5,y5,h2) - q5 = loadfp(q(5,i+2)) - - enddo - - - if(i==nb) then - call storefp(q(1,nb),q1) - p1 = fxcpmadd(p1,x1,h1) - call storefp(q(2,nb),q2) - p2 = fxcpmadd(p2,x2,h1) - call storefp(q(3,nb),q3) - p3 = fxcpmadd(p3,x3,h1) - call storefp(q(4,nb),q4) - p4 = fxcpmadd(p4,x4,h1) - call storefp(q(5,nb),q5) - p5 = fxcpmadd(p5,x5,h1) - - call storefp(q(1,nb+1),p1) - call storefp(q(2,nb+1),p2) - call storefp(q(3,nb+1),p3) - call storefp(q(4,nb+1),p4) - call storefp(q(5,nb+1),p5) - else - call storefp(q(1,nb),p1) - q1 = fxcpmadd(q1,x1,h1) - call storefp(q(2,nb),p2) - q2 = fxcpmadd(q2,x2,h1) - call storefp(q(3,nb),p3) - q3 = fxcpmadd(q3,x3,h1) - call storefp(q(4,nb),p4) - q4 = fxcpmadd(q4,x4,h1) - call storefp(q(5,nb),p5) - q5 = fxcpmadd(q5,x5,h1) - - call storefp(q(1,nb+1),q1) - call storefp(q(2,nb+1),q2) - call storefp(q(3,nb+1),q3) - call storefp(q(4,nb+1),q4) - call storefp(q(5,nb+1),q5) - endif - - - !contains - ! - ! subroutine storefp(a,b) - ! complex*16 a, b - ! - ! a = b - ! end subroutine - ! subroutine alignx(n, x) - ! integer n - ! complex*16 x(ldq/2,*) - ! end subroutine - - end subroutine hh_trafo_kernel_10_bgp - - ! 
-------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_8_bgp(q, hh, nb, ldq, ldh, s) - - use precision - implicit none - - include 'mpif.h' - - integer(kind=ik), intent(in) :: nb, ldq, ldh - complex(kind=ck), intent(inout) :: q(ldq/2,*) - real(kind=rk), intent(in) :: hh(ldh,*), s - - complex(kind=ck) :: x1, x2, x3, x4, y1, y2, y3, y4, q1, q2, q3, q4, p1, p2, p3, p4 - real(kind=rk) :: h1, h2 - integer(kind=ik) :: i - - ! complex*16 loadfp, fxcpmadd, fxpmul, fpadd, a, b - ! real*8 x - ! loadfp(a) = a - ! fxcpmadd(a,b,x) = a + b*x - ! fxpmul(a,x) = a*x - ! fpadd(a,b) = a+b - - call alignx(16,q) - - - x1 = loadfp(q(1,2)) - x2 = loadfp(q(2,2)) - x3 = loadfp(q(3,2)) - x4 = loadfp(q(4,2)) - - h2 = hh(2,2) - y1 = loadfp(q(1,1)) - y2 = loadfp(q(2,1)) - y3 = loadfp(q(3,1)) - y4 = loadfp(q(4,1)) - y1 = fxcpmadd(y1,x1,h2) - q1 = loadfp(q(1,3)) - y2 = fxcpmadd(y2,x2,h2) - q2 = loadfp(q(2,3)) - y3 = fxcpmadd(y3,x3,h2) - q3 = loadfp(q(3,3)) - y4 = fxcpmadd(y4,x4,h2) - q4 = loadfp(q(4,3)) - - h1 = hh(3-1,1) - - do i=3,nb,2 - - h2 = hh(i,2) - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - x3 = fxcpmadd(x3,q3,h1) - x4 = fxcpmadd(x4,q4,h1) - - h1 = hh(i ,1) - - y1 = fxcpmadd(y1,q1,h2) - q1 = loadfp(q(1,i+1)) - y2 = fxcpmadd(y2,q2,h2) - q2 = loadfp(q(2,i+1)) - y3 = fxcpmadd(y3,q3,h2) - q3 = loadfp(q(3,i+1)) - y4 = fxcpmadd(y4,q4,h2) - q4 = loadfp(q(4,i+1)) - - if(i==nb) exit - - h2 = hh(i+1,2) - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - x3 = fxcpmadd(x3,q3,h1) - x4 = fxcpmadd(x4,q4,h1) - - h1 = hh(i+1,1) - - y1 = fxcpmadd(y1,q1,h2) - q1 = loadfp(q(1,i+2)) - y2 = fxcpmadd(y2,q2,h2) - q2 = loadfp(q(2,i+2)) - y3 = fxcpmadd(y3,q3,h2) - q3 = loadfp(q(3,i+2)) - y4 = fxcpmadd(y4,q4,h2) - q4 = loadfp(q(4,i+2)) - - enddo - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - x3 = fxcpmadd(x3,q3,h1) - x4 = fxcpmadd(x4,q4,h1) - - h1 = -hh(1,1) ! 
for below - h2 = -hh(1,2) - x1 = fxpmul(x1,h1) - x2 = fxpmul(x2,h1) - x3 = fxpmul(x3,h1) - x4 = fxpmul(x4,h1) - h1 = -hh(1,2)*s - y1 = fxpmul(y1,h2) - y2 = fxpmul(y2,h2) - y3 = fxpmul(y3,h2) - y4 = fxpmul(y4,h2) - y1 = fxcpmadd(y1,x1,h1) - q1 = loadfp(q(1,1)) - y2 = fxcpmadd(y2,x2,h1) - q2 = loadfp(q(2,1)) - y3 = fxcpmadd(y3,x3,h1) - q3 = loadfp(q(3,1)) - y4 = fxcpmadd(y4,x4,h1) - q4 = loadfp(q(4,1)) - - q1 = fpadd(q1,y1) - p1 = loadfp(q(1,2)) - q2 = fpadd(q2,y2) - p2 = loadfp(q(2,2)) - q3 = fpadd(q3,y3) - p3 = loadfp(q(3,2)) - q4 = fpadd(q4,y4) - p4 = loadfp(q(4,2)) - - h2 = hh(2,2) - - call storefp(q(1,1),q1) - p1 = fpadd(p1,x1) - call storefp(q(2,1),q2) - p2 = fpadd(p2,x2) - call storefp(q(3,1),q3) - p3 = fpadd(p3,x3) - call storefp(q(4,1),q4) - p4 = fpadd(p4,x4) - - p1 = fxcpmadd(p1,y1,h2) - q1 = loadfp(q(1,3)) - p2 = fxcpmadd(p2,y2,h2) - q2 = loadfp(q(2,3)) - p3 = fxcpmadd(p3,y3,h2) - q3 = loadfp(q(3,3)) - p4 = fxcpmadd(p4,y4,h2) - q4 = loadfp(q(4,3)) - - h1 = hh(3-1,1) - - do i=3,nb,2 - - h2 = hh(i,2) - - call storefp(q(1,i-1),p1) - q1 = fxcpmadd(q1,x1,h1) - call storefp(q(2,i-1),p2) - q2 = fxcpmadd(q2,x2,h1) - call storefp(q(3,i-1),p3) - q3 = fxcpmadd(q3,x3,h1) - call storefp(q(4,i-1),p4) - q4 = fxcpmadd(q4,x4,h1) - - h1 = hh(i,1) - - q1 = fxcpmadd(q1,y1,h2) - p1 = loadfp(q(1,i+1)) - q2 = fxcpmadd(q2,y2,h2) - p2 = loadfp(q(2,i+1)) - q3 = fxcpmadd(q3,y3,h2) - p3 = loadfp(q(3,i+1)) - q4 = fxcpmadd(q4,y4,h2) - p4 = loadfp(q(4,i+1)) - - if(i==nb) exit - - h2 = hh(i+1,2) - - call storefp(q(1,i),q1) - p1 = fxcpmadd(p1,x1,h1) - call storefp(q(2,i),q2) - p2 = fxcpmadd(p2,x2,h1) - call storefp(q(3,i),q3) - p3 = fxcpmadd(p3,x3,h1) - call storefp(q(4,i),q4) - p4 = fxcpmadd(p4,x4,h1) - - h1 = hh(i+1,1) - - p1 = fxcpmadd(p1,y1,h2) - q1 = loadfp(q(1,i+2)) - p2 = fxcpmadd(p2,y2,h2) - q2 = loadfp(q(2,i+2)) - p3 = fxcpmadd(p3,y3,h2) - q3 = loadfp(q(3,i+2)) - p4 = fxcpmadd(p4,y4,h2) - q4 = loadfp(q(4,i+2)) - - enddo - - - if(i==nb) then - call storefp(q(1,nb),q1) - p1 = 
fxcpmadd(p1,x1,h1) - call storefp(q(2,nb),q2) - p2 = fxcpmadd(p2,x2,h1) - call storefp(q(3,nb),q3) - p3 = fxcpmadd(p3,x3,h1) - call storefp(q(4,nb),q4) - p4 = fxcpmadd(p4,x4,h1) - - call storefp(q(1,nb+1),p1) - call storefp(q(2,nb+1),p2) - call storefp(q(3,nb+1),p3) - call storefp(q(4,nb+1),p4) - else - call storefp(q(1,nb),p1) - q1 = fxcpmadd(q1,x1,h1) - call storefp(q(2,nb),p2) - q2 = fxcpmadd(q2,x2,h1) - call storefp(q(3,nb),p3) - q3 = fxcpmadd(q3,x3,h1) - call storefp(q(4,nb),p4) - q4 = fxcpmadd(q4,x4,h1) - - call storefp(q(1,nb+1),q1) - call storefp(q(2,nb+1),q2) - call storefp(q(3,nb+1),q3) - call storefp(q(4,nb+1),q4) - endif - - - !contains - ! - ! subroutine storefp(a,b) - ! complex*16 a, b - ! - ! a = b - ! end subroutine - ! subroutine alignx(n, x) - ! integer n - ! complex*16 x(ldq/2,*) - ! end subroutine - - end subroutine hh_trafo_kernel_8_bgp - - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_4_bgp(q, hh, nb, ldq, ldh, s) - - use precision - implicit none - - include 'mpif.h' - - integer(kind=ik), intent(in) :: nb, ldq, ldh - complex(kind=ck), intent(inout) :: q(ldq/2,*) - real(kind=rk), intent(in) :: hh(ldh,*), s - - complex(kind=ck) :: x1, x2, y1, y2, q1, q2, p1, p2 - real(kind=rk) :: h1, h2 - integer(kind=ik) :: i - - ! complex*16 loadfp, fxcpmadd, fxpmul, fpadd, a, b - ! real*8 x - ! loadfp(a) = a - ! fxcpmadd(a,b,x) = a + b*x - ! fxpmul(a,x) = a*x - ! 
fpadd(a,b) = a+b - - call alignx(16,q) - - - x1 = loadfp(q(1,2)) - x2 = loadfp(q(2,2)) - - h2 = hh(2,2) - y1 = loadfp(q(1,1)) - y2 = loadfp(q(2,1)) - y1 = fxcpmadd(y1,x1,h2) - q1 = loadfp(q(1,3)) - y2 = fxcpmadd(y2,x2,h2) - q2 = loadfp(q(2,3)) - - h1 = hh(3-1,1) - - do i=3,nb,2 - - h2 = hh(i,2) - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - - h1 = hh(i ,1) - - y1 = fxcpmadd(y1,q1,h2) - q1 = loadfp(q(1,i+1)) - y2 = fxcpmadd(y2,q2,h2) - q2 = loadfp(q(2,i+1)) - - if(i==nb) exit - - h2 = hh(i+1,2) - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - - h1 = hh(i+1,1) - - y1 = fxcpmadd(y1,q1,h2) - q1 = loadfp(q(1,i+2)) - y2 = fxcpmadd(y2,q2,h2) - q2 = loadfp(q(2,i+2)) - - enddo - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - - h1 = -hh(1,1) ! for below - h2 = -hh(1,2) - x1 = fxpmul(x1,h1) - x2 = fxpmul(x2,h1) - h1 = -hh(1,2)*s - y1 = fxpmul(y1,h2) - y2 = fxpmul(y2,h2) - y1 = fxcpmadd(y1,x1,h1) - q1 = loadfp(q(1,1)) - y2 = fxcpmadd(y2,x2,h1) - q2 = loadfp(q(2,1)) - - q1 = fpadd(q1,y1) - p1 = loadfp(q(1,2)) - q2 = fpadd(q2,y2) - p2 = loadfp(q(2,2)) - - h2 = hh(2,2) - - call storefp(q(1,1),q1) - p1 = fpadd(p1,x1) - call storefp(q(2,1),q2) - p2 = fpadd(p2,x2) - - p1 = fxcpmadd(p1,y1,h2) - q1 = loadfp(q(1,3)) - p2 = fxcpmadd(p2,y2,h2) - q2 = loadfp(q(2,3)) - - h1 = hh(3-1,1) - - do i=3,nb,2 - - h2 = hh(i,2) - - call storefp(q(1,i-1),p1) - q1 = fxcpmadd(q1,x1,h1) - call storefp(q(2,i-1),p2) - q2 = fxcpmadd(q2,x2,h1) - - h1 = hh(i,1) - - q1 = fxcpmadd(q1,y1,h2) - p1 = loadfp(q(1,i+1)) - q2 = fxcpmadd(q2,y2,h2) - p2 = loadfp(q(2,i+1)) - - if(i==nb) exit - - h2 = hh(i+1,2) - - call storefp(q(1,i),q1) - p1 = fxcpmadd(p1,x1,h1) - call storefp(q(2,i),q2) - p2 = fxcpmadd(p2,x2,h1) - - h1 = hh(i+1,1) - - p1 = fxcpmadd(p1,y1,h2) - q1 = loadfp(q(1,i+2)) - p2 = fxcpmadd(p2,y2,h2) - q2 = loadfp(q(2,i+2)) - - enddo - - - if(i==nb) then - call storefp(q(1,nb),q1) - p1 = fxcpmadd(p1,x1,h1) - call storefp(q(2,nb),q2) - p2 = fxcpmadd(p2,x2,h1) - - call storefp(q(1,nb+1),p1) 
- call storefp(q(2,nb+1),p2) - else - call storefp(q(1,nb),p1) - q1 = fxcpmadd(q1,x1,h1) - call storefp(q(2,nb),p2) - q2 = fxcpmadd(q2,x2,h1) - - call storefp(q(1,nb+1),q1) - call storefp(q(2,nb+1),q2) - endif - - - !contains - ! - ! subroutine storefp(a,b) - ! complex*16 a, b - ! - ! a = b - ! end subroutine - ! subroutine alignx(n, x) - ! integer n - ! complex*16 x(ldq/2,*) - ! end subroutine - - end subroutine hh_trafo_kernel_4_bgp -!end module real_bgp_kernel -! -------------------------------------------------------------------------------------------------- diff --git a/src/elpa2_kernels/elpa2_kernels_real_bgq.f90 b/src/elpa2_kernels/elpa2_kernels_real_bgq.f90 deleted file mode 100644 index 9b1c3bbb3..000000000 --- a/src/elpa2_kernels/elpa2_kernels_real_bgq.f90 +++ /dev/null @@ -1,662 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! 
GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! -------------------------------------------------------------------------------------------------- -! -! This file contains the compute intensive kernels for the Householder transformations. -! -! *** Special IBM BlueGene/Q version with QPX intrinsics in Fortran *** -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -! -------------------------------------------------------------------------------------------------- -module real_bgq_kernel - - private - public double_hh_trafo_bgq -contains - subroutine double_hh_trafo_bgq(q, hh, nb, nq, ldq, ldh) - use precision - implicit none - - integer(kind=ik), intent(in) :: nb, nq, ldq, ldh - real(kind=rk), intent(inout) :: q(ldq,*) - real(kind=rk), intent(in) :: hh(ldh,*) - - real(kind=rk) :: s - integer(kind=ik) :: i - - ! Safety only: - - if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!' - - call alignx(32,q) - - ! 
Calculate dot product of the two Householder vectors - - s = hh(2,2)*1 - do i=3,nb - s = s+hh(i,2)*hh(i-1,1) - enddo - - do i=1,nq-20,24 - call hh_trafo_kernel_24_bgq(q(i ,1), hh, nb, ldq, ldh, s) - enddo - - if(nq-i+1 > 16) then - call hh_trafo_kernel_16_bgq(q(i ,1), hh, nb, ldq, ldh, s) - call hh_trafo_kernel_4_bgq(q(i+16,1), hh, nb, ldq, ldh, s) - else if(nq-i+1 > 12) then - call hh_trafo_kernel_8_bgq(q(i ,1), hh, nb, ldq, ldh, s) - call hh_trafo_kernel_8_bgq(q(i+8,1), hh, nb, ldq, ldh, s) - else if(nq-i+1 > 8) then - call hh_trafo_kernel_8_bgq(q(i ,1), hh, nb, ldq, ldh, s) - call hh_trafo_kernel_4_bgq(q(i+8,1), hh, nb, ldq, ldh, s) - else if(nq-i+1 > 4) then - call hh_trafo_kernel_8_bgq(q(i ,1), hh, nb, ldq, ldh, s) - else if(nq-i+1 > 0) then - call hh_trafo_kernel_4_bgq(q(i ,1), hh, nb, ldq, ldh, s) - endif - - end subroutine double_hh_trafo_bgq - - - ! -------------------------------------------------------------------------------------------------- - ! The following kernels perform the Householder transformation on Q for 24/16/8/4 rows. - ! 
-------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_24_bgq(q, hh, nb, ldq, ldh, s) - use precision - implicit none - - include 'mpif.h' - - integer(kind=ik), intent(in) :: nb, ldq, ldh - - real(kind=rk), intent(inout) :: q(ldq,*) - real(kind=rk), intent(in) :: hh(ldh,*), s - - VECTOR(REAL(8))::QPX_x1, QPX_x2, QPX_x3, QPX_x4, QPX_x5, QPX_x6 - VECTOR(REAL(8))::QPX_y1, QPX_y2, QPX_y3, QPX_y4, QPX_y5, QPX_y6 - VECTOR(REAL(8))::QPX_q1, QPX_q2, QPX_q3, QPX_q4, QPX_q5, QPX_q6 - VECTOR(REAL(8))::QPX_h1, QPX_h2, QPX_tau1, QPX_tau2, QPX_s - integer i - - call alignx(32,q) - - !--- multiply Householder vectors with matrix q --- - - QPX_x1 = VEC_LD(0,q(1,2)) - QPX_x2 = VEC_LD(0,q(5,2)) - QPX_x3 = VEC_LD(0,q(9,2)) - QPX_x4 = VEC_LD(0,q(13,2)) - QPX_x5 = VEC_LD(0,q(17,2)) - QPX_x6 = VEC_LD(0,q(21,2)) - - QPX_h2 = VEC_SPLATS(hh(2,2)) - QPX_q1 = VEC_LD(0,q(1,1)) - QPX_q2 = VEC_LD(0,q(5,1)) - QPX_q3 = VEC_LD(0,q(9,1)) - QPX_q4 = VEC_LD(0,q(13,1)) - QPX_q5 = VEC_LD(0,q(17,1)) - QPX_q6 = VEC_LD(0,q(21,1)) - QPX_y1 = VEC_MADD(QPX_x1, QPX_h2, QPX_q1) - QPX_y2 = VEC_MADD(QPX_x2, QPX_h2, QPX_q2) - QPX_y3 = VEC_MADD(QPX_x3, QPX_h2, QPX_q3) - QPX_y4 = VEC_MADD(QPX_x4, QPX_h2, QPX_q4) - QPX_y5 = VEC_MADD(QPX_x5, QPX_h2, QPX_q5) - QPX_y6 = VEC_MADD(QPX_x6, QPX_h2, QPX_q6) - - do i=3,nb,1 - - QPX_q1 = VEC_LD(0,q(1,i)) - QPX_q2 = VEC_LD(0,q(5,i)) - QPX_q3 = VEC_LD(0,q(9,i)) - QPX_q4 = VEC_LD(0,q(13,i)) - QPX_q5 = VEC_LD(0,q(17,i)) - QPX_q6 = VEC_LD(0,q(21,i)) - QPX_h1 = VEC_SPLATS(hh(i-1,1)) - QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) - QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) - QPX_x3 = VEC_MADD(QPX_q3, QPX_h1, QPX_x3) - QPX_x4 = VEC_MADD(QPX_q4, QPX_h1, QPX_x4) - QPX_x5 = VEC_MADD(QPX_q5, QPX_h1, QPX_x5) - QPX_x6 = VEC_MADD(QPX_q6, QPX_h1, QPX_x6) - QPX_h2 = VEC_SPLATS(hh(i,2)) - QPX_y1 = VEC_MADD(QPX_q1, QPX_h2, QPX_y1) - QPX_y2 = VEC_MADD(QPX_q2, QPX_h2, QPX_y2) - QPX_y3 = VEC_MADD(QPX_q3, QPX_h2, QPX_y3) - 
QPX_y4 = VEC_MADD(QPX_q4, QPX_h2, QPX_y4) - QPX_y5 = VEC_MADD(QPX_q5, QPX_h2, QPX_y5) - QPX_y6 = VEC_MADD(QPX_q6, QPX_h2, QPX_y6) - - enddo - - QPX_h1 = VEC_SPLATS(hh(nb,1)) - QPX_q1 = VEC_LD(0,q(1,nb+1)) - QPX_q2 = VEC_LD(0,q(5,nb+1)) - QPX_q3 = VEC_LD(0,q(9,nb+1)) - QPX_q4 = VEC_LD(0,q(13,nb+1)) - QPX_q5 = VEC_LD(0,q(17,nb+1)) - QPX_q6 = VEC_LD(0,q(21,nb+1)) - QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) - QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) - QPX_x3 = VEC_MADD(QPX_q3, QPX_h1, QPX_x3) - QPX_x4 = VEC_MADD(QPX_q4, QPX_h1, QPX_x4) - QPX_x5 = VEC_MADD(QPX_q5, QPX_h1, QPX_x5) - QPX_x6 = VEC_MADD(QPX_q6, QPX_h1, QPX_x6) - - !--- multiply T matrix --- - - QPX_tau1 = VEC_SPLATS(-hh(1,1)) - QPX_x1 = VEC_MUL(QPX_x1, QPX_tau1) - QPX_x2 = VEC_MUL(QPX_x2, QPX_tau1) - QPX_x3 = VEC_MUL(QPX_x3, QPX_tau1) - QPX_x4 = VEC_MUL(QPX_x4, QPX_tau1) - QPX_x5 = VEC_MUL(QPX_x5, QPX_tau1) - QPX_x6 = VEC_MUL(QPX_x6, QPX_tau1) - QPX_tau2 = VEC_SPLATS(-hh(1,2)) - QPX_s = VEC_SPLATS(-hh(1,2)*s) - QPX_y1 = VEC_MUL(QPX_y1, QPX_tau2) - QPX_y2 = VEC_MUL(QPX_y2, QPX_tau2) - QPX_y3 = VEC_MUL(QPX_y3, QPX_tau2) - QPX_y4 = VEC_MUL(QPX_y4, QPX_tau2) - QPX_y5 = VEC_MUL(QPX_y5, QPX_tau2) - QPX_y6 = VEC_MUL(QPX_y6, QPX_tau2) - QPX_y1 = VEC_MADD(QPX_x1, QPX_s, QPX_y1) - QPX_y2 = VEC_MADD(QPX_x2, QPX_s, QPX_y2) - QPX_y3 = VEC_MADD(QPX_x3, QPX_s, QPX_y3) - QPX_y4 = VEC_MADD(QPX_x4, QPX_s, QPX_y4) - QPX_y5 = VEC_MADD(QPX_x5, QPX_s, QPX_y5) - QPX_y6 = VEC_MADD(QPX_x6, QPX_s, QPX_y6) - - !--- rank-2 update of q --- - - QPX_q1 = VEC_LD(0,q(1,1)) - QPX_q2 = VEC_LD(0,q(5,1)) - QPX_q3 = VEC_LD(0,q(9,1)) - QPX_q4 = VEC_LD(0,q(13,1)) - QPX_q5 = VEC_LD(0,q(17,1)) - QPX_q6 = VEC_LD(0,q(21,1)) - QPX_q1 = VEC_ADD(QPX_q1, QPX_y1) - QPX_q2 = VEC_ADD(QPX_q2, QPX_y2) - QPX_q3 = VEC_ADD(QPX_q3, QPX_y3) - QPX_q4 = VEC_ADD(QPX_q4, QPX_y4) - QPX_q5 = VEC_ADD(QPX_q5, QPX_y5) - QPX_q6 = VEC_ADD(QPX_q6, QPX_y6) - call VEC_ST(QPX_q1, 0, q(1,1)) - call VEC_ST(QPX_q2, 0, q(5,1)) - call VEC_ST(QPX_q3, 0, q(9,1)) - call VEC_ST(QPX_q4, 
0, q(13,1)) - call VEC_ST(QPX_q5, 0, q(17,1)) - call VEC_ST(QPX_q6, 0, q(21,1)) - - QPX_h2 = VEC_SPLATS(hh(2,2)) - QPX_q1 = VEC_LD(0,q(1,2)) - QPX_q2 = VEC_LD(0,q(5,2)) - QPX_q3 = VEC_LD(0,q(9,2)) - QPX_q4 = VEC_LD(0,q(13,2)) - QPX_q5 = VEC_LD(0,q(17,2)) - QPX_q6 = VEC_LD(0,q(21,2)) - QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) - QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) - QPX_q3 = VEC_MADD(QPX_y3, QPX_h2, QPX_q3) - QPX_q4 = VEC_MADD(QPX_y4, QPX_h2, QPX_q4) - QPX_q5 = VEC_MADD(QPX_y5, QPX_h2, QPX_q5) - QPX_q6 = VEC_MADD(QPX_y6, QPX_h2, QPX_q6) - QPX_q1 = VEC_ADD(QPX_q1, QPX_x1) - QPX_q2 = VEC_ADD(QPX_q2, QPX_x2) - QPX_q3 = VEC_ADD(QPX_q3, QPX_x3) - QPX_q4 = VEC_ADD(QPX_q4, QPX_x4) - QPX_q5 = VEC_ADD(QPX_q5, QPX_x5) - QPX_q6 = VEC_ADD(QPX_q6, QPX_x6) - call VEC_ST(QPX_q1, 0, q(1,2)) - call VEC_ST(QPX_q2, 0, q(5,2)) - call VEC_ST(QPX_q3, 0, q(9,2)) - call VEC_ST(QPX_q4, 0, q(13,2)) - call VEC_ST(QPX_q5, 0, q(17,2)) - call VEC_ST(QPX_q6, 0, q(21,2)) - - do i=3,nb,1 - - QPX_q1 = VEC_LD(0,q(1,i)) - QPX_q2 = VEC_LD(0,q(5,i)) - QPX_q3 = VEC_LD(0,q(9,i)) - QPX_q4 = VEC_LD(0,q(13,i)) - QPX_q5 = VEC_LD(0,q(17,i)) - QPX_q6 = VEC_LD(0,q(21,i)) - QPX_h1 = VEC_SPLATS(hh(i-1,1)) - QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) - QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) - QPX_q3 = VEC_MADD(QPX_x3, QPX_h1, QPX_q3) - QPX_q4 = VEC_MADD(QPX_x4, QPX_h1, QPX_q4) - QPX_q5 = VEC_MADD(QPX_x5, QPX_h1, QPX_q5) - QPX_q6 = VEC_MADD(QPX_x6, QPX_h1, QPX_q6) - QPX_h2 = VEC_SPLATS(hh(i,2)) - QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) - QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) - QPX_q3 = VEC_MADD(QPX_y3, QPX_h2, QPX_q3) - QPX_q4 = VEC_MADD(QPX_y4, QPX_h2, QPX_q4) - QPX_q5 = VEC_MADD(QPX_y5, QPX_h2, QPX_q5) - QPX_q6 = VEC_MADD(QPX_y6, QPX_h2, QPX_q6) - - call VEC_ST(QPX_q1, 0, q(1,i)) - call VEC_ST(QPX_q2, 0, q(5,i)) - call VEC_ST(QPX_q3, 0, q(9,i)) - call VEC_ST(QPX_q4, 0, q(13,i)) - call VEC_ST(QPX_q5, 0, q(17,i)) - call VEC_ST(QPX_q6, 0, q(21,i)) - - enddo - - QPX_h1 = VEC_SPLATS(hh(nb,1)) - QPX_q1 = 
VEC_LD(0,q(1,nb+1)) - QPX_q2 = VEC_LD(0,q(5,nb+1)) - QPX_q3 = VEC_LD(0,q(9,nb+1)) - QPX_q4 = VEC_LD(0,q(13,nb+1)) - QPX_q5 = VEC_LD(0,q(17,nb+1)) - QPX_q6 = VEC_LD(0,q(21,nb+1)) - QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) - QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) - QPX_q3 = VEC_MADD(QPX_x3, QPX_h1, QPX_q3) - QPX_q4 = VEC_MADD(QPX_x4, QPX_h1, QPX_q4) - QPX_q5 = VEC_MADD(QPX_x5, QPX_h1, QPX_q5) - QPX_q6 = VEC_MADD(QPX_x6, QPX_h1, QPX_q6) - call VEC_ST(QPX_q1, 0, q(1,nb+1)) - call VEC_ST(QPX_q2, 0, q(5,nb+1)) - call VEC_ST(QPX_q3, 0, q(9,nb+1)) - call VEC_ST(QPX_q4, 0, q(13,nb+1)) - call VEC_ST(QPX_q5, 0, q(17,nb+1)) - call VEC_ST(QPX_q6, 0, q(21,nb+1)) - - end subroutine hh_trafo_kernel_24_bgq - - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_16_bgq(q, hh, nb, ldq, ldh, s) - use precision - implicit none - - include 'mpif.h' - - integer(kind=ik), intent(in) :: nb, ldq, ldh - - real(kind=rk), intent(inout) :: q(ldq,*) - real(kind=rk), intent(in) :: hh(ldh,*), s - - VECTOR(REAL(8))::QPX_x1, QPX_x2, QPX_x3, QPX_x4 - VECTOR(REAL(8))::QPX_y1, QPX_y2, QPX_y3, QPX_y4 - VECTOR(REAL(8))::QPX_q1, QPX_q2, QPX_q3, QPX_q4 - VECTOR(REAL(8))::QPX_h1, QPX_h2, QPX_tau1, QPX_tau2, QPX_s - integer i - - call alignx(32,q) - - !--- multiply Householder vectors with matrix q --- - - QPX_x1 = VEC_LD(0,q(1,2)) - QPX_x2 = VEC_LD(0,q(5,2)) - QPX_x3 = VEC_LD(0,q(9,2)) - QPX_x4 = VEC_LD(0,q(13,2)) - - QPX_h2 = VEC_SPLATS(hh(2,2)) - QPX_q1 = VEC_LD(0,q(1,1)) - QPX_q2 = VEC_LD(0,q(5,1)) - QPX_q3 = VEC_LD(0,q(9,1)) - QPX_q4 = VEC_LD(0,q(13,1)) - QPX_y1 = VEC_MADD(QPX_x1, QPX_h2, QPX_q1) - QPX_y2 = VEC_MADD(QPX_x2, QPX_h2, QPX_q2) - QPX_y3 = VEC_MADD(QPX_x3, QPX_h2, QPX_q3) - QPX_y4 = VEC_MADD(QPX_x4, QPX_h2, QPX_q4) - - do i=3,nb,1 - - QPX_q1 = VEC_LD(0,q(1,i)) - QPX_q2 = VEC_LD(0,q(5,i)) - QPX_q3 = VEC_LD(0,q(9,i)) - QPX_q4 = VEC_LD(0,q(13,i)) - QPX_h1 = VEC_SPLATS(hh(i-1,1)) - QPX_x1 = VEC_MADD(QPX_q1, 
QPX_h1, QPX_x1) - QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) - QPX_x3 = VEC_MADD(QPX_q3, QPX_h1, QPX_x3) - QPX_x4 = VEC_MADD(QPX_q4, QPX_h1, QPX_x4) - QPX_h2 = VEC_SPLATS(hh(i,2)) - QPX_y1 = VEC_MADD(QPX_q1, QPX_h2, QPX_y1) - QPX_y2 = VEC_MADD(QPX_q2, QPX_h2, QPX_y2) - QPX_y3 = VEC_MADD(QPX_q3, QPX_h2, QPX_y3) - QPX_y4 = VEC_MADD(QPX_q4, QPX_h2, QPX_y4) - - enddo - - QPX_h1 = VEC_SPLATS(hh(nb,1)) - QPX_q1 = VEC_LD(0,q(1,nb+1)) - QPX_q2 = VEC_LD(0,q(5,nb+1)) - QPX_q3 = VEC_LD(0,q(9,nb+1)) - QPX_q4 = VEC_LD(0,q(13,nb+1)) - QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) - QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) - QPX_x3 = VEC_MADD(QPX_q3, QPX_h1, QPX_x3) - QPX_x4 = VEC_MADD(QPX_q4, QPX_h1, QPX_x4) - - !--- multiply T matrix --- - - QPX_tau1 = VEC_SPLATS(-hh(1,1)) - QPX_x1 = VEC_MUL(QPX_x1, QPX_tau1) - QPX_x2 = VEC_MUL(QPX_x2, QPX_tau1) - QPX_x3 = VEC_MUL(QPX_x3, QPX_tau1) - QPX_x4 = VEC_MUL(QPX_x4, QPX_tau1) - QPX_tau2 = VEC_SPLATS(-hh(1,2)) - QPX_s = VEC_SPLATS(-hh(1,2)*s) - QPX_y1 = VEC_MUL(QPX_y1, QPX_tau2) - QPX_y2 = VEC_MUL(QPX_y2, QPX_tau2) - QPX_y3 = VEC_MUL(QPX_y3, QPX_tau2) - QPX_y4 = VEC_MUL(QPX_y4, QPX_tau2) - QPX_y1 = VEC_MADD(QPX_x1, QPX_s, QPX_y1) - QPX_y2 = VEC_MADD(QPX_x2, QPX_s, QPX_y2) - QPX_y3 = VEC_MADD(QPX_x3, QPX_s, QPX_y3) - QPX_y4 = VEC_MADD(QPX_x4, QPX_s, QPX_y4) - - !--- rank-2 update of q --- - - QPX_q1 = VEC_LD(0,q(1,1)) - QPX_q2 = VEC_LD(0,q(5,1)) - QPX_q3 = VEC_LD(0,q(9,1)) - QPX_q4 = VEC_LD(0,q(13,1)) - QPX_q1 = VEC_ADD(QPX_q1, QPX_y1) - QPX_q2 = VEC_ADD(QPX_q2, QPX_y2) - QPX_q3 = VEC_ADD(QPX_q3, QPX_y3) - QPX_q4 = VEC_ADD(QPX_q4, QPX_y4) - call VEC_ST(QPX_q1, 0, q(1,1)) - call VEC_ST(QPX_q2, 0, q(5,1)) - call VEC_ST(QPX_q3, 0, q(9,1)) - call VEC_ST(QPX_q4, 0, q(13,1)) - - QPX_h2 = VEC_SPLATS(hh(2,2)) - QPX_q1 = VEC_LD(0,q(1,2)) - QPX_q2 = VEC_LD(0,q(5,2)) - QPX_q3 = VEC_LD(0,q(9,2)) - QPX_q4 = VEC_LD(0,q(13,2)) - QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) - QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) - QPX_q3 = VEC_MADD(QPX_y3, QPX_h2, QPX_q3) 
- QPX_q4 = VEC_MADD(QPX_y4, QPX_h2, QPX_q4) - QPX_q1 = VEC_ADD(QPX_q1, QPX_x1) - QPX_q2 = VEC_ADD(QPX_q2, QPX_x2) - QPX_q3 = VEC_ADD(QPX_q3, QPX_x3) - QPX_q4 = VEC_ADD(QPX_q4, QPX_x4) - call VEC_ST(QPX_q1, 0, q(1,2)) - call VEC_ST(QPX_q2, 0, q(5,2)) - call VEC_ST(QPX_q3, 0, q(9,2)) - call VEC_ST(QPX_q4, 0, q(13,2)) - - do i=3,nb,1 - - QPX_q1 = VEC_LD(0,q(1,i)) - QPX_q2 = VEC_LD(0,q(5,i)) - QPX_q3 = VEC_LD(0,q(9,i)) - QPX_q4 = VEC_LD(0,q(13,i)) - QPX_h1 = VEC_SPLATS(hh(i-1,1)) - QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) - QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) - QPX_q3 = VEC_MADD(QPX_x3, QPX_h1, QPX_q3) - QPX_q4 = VEC_MADD(QPX_x4, QPX_h1, QPX_q4) - QPX_h2 = VEC_SPLATS(hh(i,2)) - QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) - QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) - QPX_q3 = VEC_MADD(QPX_y3, QPX_h2, QPX_q3) - QPX_q4 = VEC_MADD(QPX_y4, QPX_h2, QPX_q4) - - call VEC_ST(QPX_q1, 0, q(1,i)) - call VEC_ST(QPX_q2, 0, q(5,i)) - call VEC_ST(QPX_q3, 0, q(9,i)) - call VEC_ST(QPX_q4, 0, q(13,i)) - - enddo - - QPX_h1 = VEC_SPLATS(hh(nb,1)) - QPX_q1 = VEC_LD(0,q(1,nb+1)) - QPX_q2 = VEC_LD(0,q(5,nb+1)) - QPX_q3 = VEC_LD(0,q(9,nb+1)) - QPX_q4 = VEC_LD(0,q(13,nb+1)) - QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) - QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) - QPX_q3 = VEC_MADD(QPX_x3, QPX_h1, QPX_q3) - QPX_q4 = VEC_MADD(QPX_x4, QPX_h1, QPX_q4) - call VEC_ST(QPX_q1, 0, q(1,nb+1)) - call VEC_ST(QPX_q2, 0, q(5,nb+1)) - call VEC_ST(QPX_q3, 0, q(9,nb+1)) - call VEC_ST(QPX_q4, 0, q(13,nb+1)) - - end subroutine hh_trafo_kernel_16_bgq - - ! 
-------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_8_bgq(q, hh, nb, ldq, ldh, s) - use precision - implicit none - - include 'mpif.h' - - integer(kind=ik), intent(in) :: nb, ldq, ldh - - real(kind=rk), intent(inout) :: q(ldq,*) - real(kind=rk), intent(in) :: hh(ldh,*), s - integer(kind=ik) :: i - VECTOR(REAL(8))::QPX_x1, QPX_x2, QPX_y1, QPX_y2 - VECTOR(REAL(8))::QPX_q1, QPX_q2 - VECTOR(REAL(8))::QPX_h1, QPX_h2, QPX_tau1, QPX_tau2, QPX_s - - - call alignx(32,q) - - !--- multiply Householder vectors with matrix q --- - - QPX_x1 = VEC_LD(0,q(1,2)) - QPX_x2 = VEC_LD(0,q(5,2)) - - QPX_h2 = VEC_SPLATS(hh(2,2)) - QPX_q1 = VEC_LD(0,q(1,1)) - QPX_q2 = VEC_LD(0,q(5,1)) - QPX_y1 = VEC_MADD(QPX_x1, QPX_h2, QPX_q1) - QPX_y2 = VEC_MADD(QPX_x2, QPX_h2, QPX_q2) - - do i=3,nb,1 - - QPX_q1 = VEC_LD(0,q(1,i)) - QPX_q2 = VEC_LD(0,q(5,i)) - QPX_h1 = VEC_SPLATS(hh(i-1,1)) - QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) - QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) - QPX_h2 = VEC_SPLATS(hh(i,2)) - QPX_y1 = VEC_MADD(QPX_q1, QPX_h2, QPX_y1) - QPX_y2 = VEC_MADD(QPX_q2, QPX_h2, QPX_y2) - - enddo - - QPX_h1 = VEC_SPLATS(hh(nb,1)) - QPX_q1 = VEC_LD(0,q(1,nb+1)) - QPX_q2 = VEC_LD(0,q(5,nb+1)) - QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) - QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) - - !--- multiply T matrix --- - - QPX_tau1 = VEC_SPLATS(-hh(1,1)) - QPX_x1 = VEC_MUL(QPX_x1, QPX_tau1) - QPX_x2 = VEC_MUL(QPX_x2, QPX_tau1) - QPX_tau2 = VEC_SPLATS(-hh(1,2)) - QPX_s = VEC_SPLATS(-hh(1,2)*s) - QPX_y1 = VEC_MUL(QPX_y1, QPX_tau2) - QPX_y2 = VEC_MUL(QPX_y2, QPX_tau2) - QPX_y1 = VEC_MADD(QPX_x1, QPX_s, QPX_y1) - QPX_y2 = VEC_MADD(QPX_x2, QPX_s, QPX_y2) - - !--- rank-2 update of q --- - - QPX_q1 = VEC_LD(0,q(1,1)) - QPX_q2 = VEC_LD(0,q(5,1)) - QPX_q1 = VEC_ADD(QPX_q1, QPX_y1) - QPX_q2 = VEC_ADD(QPX_q2, QPX_y2) - call VEC_ST(QPX_q1, 0, q(1,1)) - call VEC_ST(QPX_q2, 0, q(5,1)) - - QPX_h2 = VEC_SPLATS(hh(2,2)) - QPX_q1 = VEC_LD(0,q(1,2)) - 
QPX_q2 = VEC_LD(0,q(5,2)) - QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) - QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) - QPX_q1 = VEC_ADD(QPX_q1, QPX_x1) - QPX_q2 = VEC_ADD(QPX_q2, QPX_x2) - call VEC_ST(QPX_q1, 0, q(1,2)) - call VEC_ST(QPX_q2, 0, q(5,2)) - - do i=3,nb,1 - - QPX_q1 = VEC_LD(0,q(1,i)) - QPX_q2 = VEC_LD(0,q(5,i)) - QPX_h1 = VEC_SPLATS(hh(i-1,1)) - QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) - QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) - QPX_h2 = VEC_SPLATS(hh(i,2)) - QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) - QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) - - call VEC_ST(QPX_q1, 0, q(1,i)) - call VEC_ST(QPX_q2, 0, q(5,i)) - - enddo - - QPX_h1 = VEC_SPLATS(hh(nb,1)) - QPX_q1 = VEC_LD(0,q(1,nb+1)) - QPX_q2 = VEC_LD(0,q(5,nb+1)) - QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) - QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) - call VEC_ST(QPX_q1, 0, q(1,nb+1)) - call VEC_ST(QPX_q2, 0, q(5,nb+1)) - - end subroutine hh_trafo_kernel_8_bgq - - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_4_bgq(q, hh, nb, ldq, ldh, s) - use precision - implicit none - - include 'mpif.h' - - integer(kind=ik), intent(in) :: nb, ldq, ldh - - real(kind=rk), intent(inout) :: q(ldq,*) - real(kind=rk), intent(in) :: hh(ldh,*), s - integer(kind=ik) :: i - VECTOR(REAL(8))::QPX_x1, QPX_y1 - VECTOR(REAL(8))::QPX_q1 - VECTOR(REAL(8))::QPX_h1, QPX_h2, QPX_tau1, QPX_tau2, QPX_s - - call alignx(32,q) - - !--- multiply Householder vectors with matrix q --- - - QPX_x1 = VEC_LD(0,q(1,2)) - - QPX_h2 = VEC_SPLATS(hh(2,2)) - QPX_q1 = VEC_LD(0,q(1,1)) - QPX_y1 = VEC_MADD(QPX_x1, QPX_h2, QPX_q1) - - do i=3,nb,1 - - QPX_q1 = VEC_LD(0,q(1,i)) - QPX_h1 = VEC_SPLATS(hh(i-1,1)) - QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) - QPX_h2 = VEC_SPLATS(hh(i,2)) - QPX_y1 = VEC_MADD(QPX_q1, QPX_h2, QPX_y1) - - enddo - - QPX_h1 = VEC_SPLATS(hh(nb,1)) - QPX_q1 = VEC_LD(0,q(1,nb+1)) - QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) - - !--- multiply T matrix 
--- - - QPX_tau1 = VEC_SPLATS(-hh(1,1)) - QPX_x1 = VEC_MUL(QPX_x1, QPX_tau1) - QPX_tau2 = VEC_SPLATS(-hh(1,2)) - QPX_s = VEC_SPLATS(-hh(1,2)*s) - QPX_y1 = VEC_MUL(QPX_y1, QPX_tau2) - QPX_y1 = VEC_MADD(QPX_x1, QPX_s, QPX_y1) - - !--- rank-2 update of q --- - - QPX_q1 = VEC_LD(0,q(1,1)) - QPX_q1 = VEC_ADD(QPX_q1, QPX_y1) - call VEC_ST(QPX_q1, 0, q(1,1)) - - QPX_h2 = VEC_SPLATS(hh(2,2)) - QPX_q1 = VEC_LD(0,q(1,2)) - QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) - QPX_q1 = VEC_ADD(QPX_q1, QPX_x1) - call VEC_ST(QPX_q1, 0, q(1,2)) - - do i=3,nb,1 - - QPX_q1 = VEC_LD(0,q(1,i)) - QPX_h1 = VEC_SPLATS(hh(i-1,1)) - QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) - QPX_h2 = VEC_SPLATS(hh(i,2)) - QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) - - call VEC_ST(QPX_q1, 0, q(1,i)) - - enddo - - QPX_h1 = VEC_SPLATS(hh(nb,1)) - QPX_q1 = VEC_LD(0,q(1,nb+1)) - QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) - call VEC_ST(QPX_q1, 0, q(1,nb+1)) - - end subroutine hh_trafo_kernel_4_bgq -end module real_bgq_kernel -! -------------------------------------------------------------------------------------------------- diff --git a/src/elpa2_kernels/elpa2_kernels_real_simple.F90 b/src/elpa2_kernels/elpa2_kernels_real_simple.F90 deleted file mode 100644 index 334958930..000000000 --- a/src/elpa2_kernels/elpa2_kernels_real_simple.F90 +++ /dev/null @@ -1,136 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! 
- IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! -------------------------------------------------------------------------------------------------- -! -! This file contains the compute intensive kernels for the Householder transformations. -! -! This is the small and simple version (no hand unrolling of loops etc.) but for some -! compilers this performs better than a sophisticated version with transformed and unrolled loops. -! -! It should be compiled with the highest possible optimization level. -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -! 
-------------------------------------------------------------------------------------------------- - -#include "config-f90.h" - -module real_generic_simple_kernel - - private - public double_hh_trafo_generic_simple -contains - subroutine double_hh_trafo_generic_simple(q, hh, nb, nq, ldq, ldh) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, nq, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk), intent(inout) :: q(ldq,*) - real(kind=rk), intent(in) :: hh(ldh,*) -#else - real(kind=rk), intent(inout) :: q(ldq,1:nb+1) - real(kind=rk), intent(in) :: hh(ldh,2) -#endif - - real(kind=rk) :: s, h1, h2, tau1, tau2, x(nq), y(nq) - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic simple: double_hh_trafo_generic_simple") -#endif - ! Calculate dot product of the two Householder vectors - - s = hh(2,2)*1 - do i=3,nb - s = s+hh(i,2)*hh(i-1,1) - enddo - - ! Do the Householder transformations - - x(1:nq) = q(1:nq,2) - - y(1:nq) = q(1:nq,1) + q(1:nq,2)*hh(2,2) - - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - x(1:nq) = x(1:nq) + q(1:nq,i)*h1 - y(1:nq) = y(1:nq) + q(1:nq,i)*h2 - enddo - - x(1:nq) = x(1:nq) + q(1:nq,nb+1)*hh(nb,1) - - tau1 = hh(1,1) - tau2 = hh(1,2) - - h1 = -tau1 - x(1:nq) = x(1:nq)*h1 - h1 = -tau2 - h2 = -tau2*s - y(1:nq) = y(1:nq)*h1 + x(1:nq)*h2 - - q(1:nq,1) = q(1:nq,1) + y(1:nq) - q(1:nq,2) = q(1:nq,2) + x(1:nq) + y(1:nq)*hh(2,2) - - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - q(1:nq,i) = q(1:nq,i) + x(1:nq)*h1 + y(1:nq)*h2 - enddo - - q(1:nq,nb+1) = q(1:nq,nb+1) + x(1:nq)*hh(nb,1) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic simple: double_hh_trafo_generic_simple") -#endif - - end subroutine double_hh_trafo_generic_simple -end module real_generic_simple_kernel -! 
-------------------------------------------------------------------------------------------------- diff --git a/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c b/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c deleted file mode 100644 index 6dad5aa61..000000000 --- a/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c +++ /dev/null @@ -1,1718 +0,0 @@ -// This file is part of ELPA. -// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// This particular source code file contains additions, changes and -// enhancements authored by Intel Corporation which is not part of -// the ELPA consortium. -// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. 
If not, see <http://www.gnu.org/licenses/> -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. -// -// -// -------------------------------------------------------------------------------------------------- -// -// This file contains the compute intensive kernels for the Householder transformations. -// It should be compiled with the highest possible optimization level. -// -// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 -// On Intel Sandy Bridge use -O3 -mavx -// -// Copyright of the original code rests with the authors inside the ELPA -// consortium. The copyright of any additional modifications shall rest -// with their original authors, but shall adhere to the licensing terms -// distributed along with the original code in the file "COPYING". 
-// -// Author: Alexander Heinecke (alexander.heinecke@mytum.de) -// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) -// -------------------------------------------------------------------------------------------------- - -#include <x86intrin.h> - -#define __forceinline __attribute__((always_inline)) static - - -#ifdef __USE_AVX128__ -#undef __AVX__ -#endif - -#ifdef __FMA4__ -#define __ELPA_USE_FMA__ -#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) -#endif - -#ifdef __AVX2__ -#define __ELPA_USE_FMA__ -#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c) -#endif - -//Forward declaration -#ifdef __AVX__ -__forceinline void hh_trafo_kernel_4_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -__forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -__forceinline void hh_trafo_kernel_16_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -__forceinline void hh_trafo_kernel_24_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -#else -__forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -__forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -__forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -#endif - -void double_hh_trafo_real_sse_avx_2hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); -#if 0 -void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); -#endif - -void double_hh_trafo_real_sse_avx_2hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - // calculating scalar product to compute - // 2 householder vectors simultaneously - double s = hh[(ldh)+1]*1.0; - - #pragma ivdep - for (i = 2; i < nb; i++) - { - s += 
hh[i-1] * hh[(i+ldh)]; - } - - // Production level kernel calls with padding -#ifdef __AVX__ - for (i = 0; i < nq-20; i+=24) - { - hh_trafo_kernel_24_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } - - if (nq == i) - { - return; - } - - if (nq-i == 20) - { - hh_trafo_kernel_16_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - hh_trafo_kernel_4_AVX_2hv(&q[i+16], hh, nb, ldq, ldh, s); - } - else if (nq-i == 16) - { - hh_trafo_kernel_16_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } - else if (nq-i == 12) - { - hh_trafo_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - hh_trafo_kernel_4_AVX_2hv(&q[i+8], hh, nb, ldq, ldh, s); - } - else if (nq-i == 8) - { - hh_trafo_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } - else - { - hh_trafo_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } -#else - for (i = 0; i < nq-8; i+=12) - { - hh_trafo_kernel_12_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); - } - if (nq == i) - { - return; - } - else - { - if (nq-i > 4) - { - hh_trafo_kernel_8_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); - } - else if (nq-i > 0) - { - hh_trafo_kernel_4_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); - } - } -#endif -} - -#if 0 -void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - // calculating scalar product to compute - // 2 householder vectors simultaneously - double s = hh[(ldh)+1]*1.0; - - #pragma ivdep - for (i = 2; i < nb; i++) - { - s += hh[i-1] * hh[(i+ldh)]; - } - - // Production level kernel calls with padding -#ifdef __AVX__ - for (i = 0; i < nq; i+=24) - { - hh_trafo_kernel_24_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } -#else - for (i = 0; i < nq; i+=12) - { - hh_trafo_kernel_12_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); - } -#endif -} -#endif - -#ifdef __AVX__ -/** - * Unrolled kernel that computes - * 24 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 2 update is performed - */ - __forceinline void 
hh_trafo_kernel_24_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [24 x nb+1] * hh - // hh contains two householder vectors, with offset 1 - ///////////////////////////////////////////////////// - int i; - // Needed bit mask for floating point sign flip - __m256d sign = (__m256d)_mm256_set1_epi64x(0x8000000000000000); - - __m256d x1 = _mm256_load_pd(&q[ldq]); - __m256d x2 = _mm256_load_pd(&q[ldq+4]); - __m256d x3 = _mm256_load_pd(&q[ldq+8]); - __m256d x4 = _mm256_load_pd(&q[ldq+12]); - __m256d x5 = _mm256_load_pd(&q[ldq+16]); - __m256d x6 = _mm256_load_pd(&q[ldq+20]); - - __m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h2; - -#ifdef __ELPA_USE_FMA__ - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_FMA_pd(x1, h1, q1); - __m256d q2 = _mm256_load_pd(&q[4]); - __m256d y2 = _mm256_FMA_pd(x2, h1, q2); - __m256d q3 = _mm256_load_pd(&q[8]); - __m256d y3 = _mm256_FMA_pd(x3, h1, q3); - __m256d q4 = _mm256_load_pd(&q[12]); - __m256d y4 = _mm256_FMA_pd(x4, h1, q4); - __m256d q5 = _mm256_load_pd(&q[16]); - __m256d y5 = _mm256_FMA_pd(x5, h1, q5); - __m256d q6 = _mm256_load_pd(&q[20]); - __m256d y6 = _mm256_FMA_pd(x6, h1, q6); -#else - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); - __m256d q2 = _mm256_load_pd(&q[4]); - __m256d y2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); - __m256d q3 = _mm256_load_pd(&q[8]); - __m256d y3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1)); - __m256d q4 = _mm256_load_pd(&q[12]); - __m256d y4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1)); - __m256d q5 = _mm256_load_pd(&q[16]); - __m256d y5 = _mm256_add_pd(q5, _mm256_mul_pd(x5, h1)); - __m256d q6 = _mm256_load_pd(&q[20]); - __m256d y6 = _mm256_add_pd(q6, _mm256_mul_pd(x6, h1)); -#endif - - for(i = 2; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-1]); - h2 = _mm256_broadcast_sd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = 
_mm256_load_pd(&q[i*ldq]); - x1 = _mm256_FMA_pd(q1, h1, x1); - y1 = _mm256_FMA_pd(q1, h2, y1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - x2 = _mm256_FMA_pd(q2, h1, x2); - y2 = _mm256_FMA_pd(q2, h2, y2); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - x3 = _mm256_FMA_pd(q3, h1, x3); - y3 = _mm256_FMA_pd(q3, h2, y3); - q4 = _mm256_load_pd(&q[(i*ldq)+12]); - x4 = _mm256_FMA_pd(q4, h1, x4); - y4 = _mm256_FMA_pd(q4, h2, y4); - q5 = _mm256_load_pd(&q[(i*ldq)+16]); - x5 = _mm256_FMA_pd(q5, h1, x5); - y5 = _mm256_FMA_pd(q5, h2, y5); - q6 = _mm256_load_pd(&q[(i*ldq)+20]); - x6 = _mm256_FMA_pd(q6, h1, x6); - y6 = _mm256_FMA_pd(q6, h2, y6); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); - y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); - q4 = _mm256_load_pd(&q[(i*ldq)+12]); - x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1)); - y4 = _mm256_add_pd(y4, _mm256_mul_pd(q4,h2)); - q5 = _mm256_load_pd(&q[(i*ldq)+16]); - x5 = _mm256_add_pd(x5, _mm256_mul_pd(q5,h1)); - y5 = _mm256_add_pd(y5, _mm256_mul_pd(q5,h2)); - q6 = _mm256_load_pd(&q[(i*ldq)+20]); - x6 = _mm256_add_pd(x6, _mm256_mul_pd(q6,h1)); - y6 = _mm256_add_pd(y6, _mm256_mul_pd(q6,h2)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[nb*ldq]); - x1 = _mm256_FMA_pd(q1, h1, x1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - x2 = _mm256_FMA_pd(q2, h1, x2); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - x3 = _mm256_FMA_pd(q3, h1, x3); - q4 = _mm256_load_pd(&q[(nb*ldq)+12]); - x4 = _mm256_FMA_pd(q4, h1, x4); - q5 = _mm256_load_pd(&q[(nb*ldq)+16]); - x5 = _mm256_FMA_pd(q5, h1, x5); - q6 = _mm256_load_pd(&q[(nb*ldq)+20]); - x6 = _mm256_FMA_pd(q6, h1, x6); -#else - q1 = _mm256_load_pd(&q[nb*ldq]); - x1 
= _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); - q4 = _mm256_load_pd(&q[(nb*ldq)+12]); - x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1)); - q5 = _mm256_load_pd(&q[(nb*ldq)+16]); - x5 = _mm256_add_pd(x5, _mm256_mul_pd(q5,h1)); - q6 = _mm256_load_pd(&q[(nb*ldq)+20]); - x6 = _mm256_add_pd(x6, _mm256_mul_pd(q6,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-2 update of Q [24 x nb+1] - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(hh); - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d vs = _mm256_broadcast_sd(&s); - - h1 = _mm256_xor_pd(tau1, sign); - x1 = _mm256_mul_pd(x1, h1); - x2 = _mm256_mul_pd(x2, h1); - x3 = _mm256_mul_pd(x3, h1); - x4 = _mm256_mul_pd(x4, h1); - x5 = _mm256_mul_pd(x5, h1); - x6 = _mm256_mul_pd(x6, h1); - h1 = _mm256_xor_pd(tau2, sign); - h2 = _mm256_mul_pd(h1, vs); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(y1, h1, _mm256_mul_pd(x1,h2)); - y2 = _mm256_FMA_pd(y2, h1, _mm256_mul_pd(x2,h2)); - y3 = _mm256_FMA_pd(y3, h1, _mm256_mul_pd(x3,h2)); - y4 = _mm256_FMA_pd(y4, h1, _mm256_mul_pd(x4,h2)); - y5 = _mm256_FMA_pd(y5, h1, _mm256_mul_pd(x5,h2)); - y6 = _mm256_FMA_pd(y6, h1, _mm256_mul_pd(x6,h2)); -#else - y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); - y2 = _mm256_add_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2)); - y3 = _mm256_add_pd(_mm256_mul_pd(y3,h1), _mm256_mul_pd(x3,h2)); - y4 = _mm256_add_pd(_mm256_mul_pd(y4,h1), _mm256_mul_pd(x4,h2)); - y5 = _mm256_add_pd(_mm256_mul_pd(y5,h1), _mm256_mul_pd(x5,h2)); - y6 = _mm256_add_pd(_mm256_mul_pd(y6,h1), _mm256_mul_pd(x6,h2)); -#endif - - q1 = _mm256_load_pd(q); - q1 = _mm256_add_pd(q1, y1); - _mm256_store_pd(q,q1); - q2 = _mm256_load_pd(&q[4]); - q2 = _mm256_add_pd(q2, y2); - _mm256_store_pd(&q[4],q2); - q3 = 
_mm256_load_pd(&q[8]); - q3 = _mm256_add_pd(q3, y3); - _mm256_store_pd(&q[8],q3); - q4 = _mm256_load_pd(&q[12]); - q4 = _mm256_add_pd(q4, y4); - _mm256_store_pd(&q[12],q4); - q5 = _mm256_load_pd(&q[16]); - q5 = _mm256_add_pd(q5, y5); - _mm256_store_pd(&q[16],q5); - q6 = _mm256_load_pd(&q[20]); - q6 = _mm256_add_pd(q6, y6); - _mm256_store_pd(&q[20],q6); - - h2 = _mm256_broadcast_sd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_add_pd(q1, _mm256_FMA_pd(y1, h2, x1)); - _mm256_store_pd(&q[ldq],q1); - q2 = _mm256_load_pd(&q[ldq+4]); - q2 = _mm256_add_pd(q2, _mm256_FMA_pd(y2, h2, x2)); - _mm256_store_pd(&q[ldq+4],q2); - q3 = _mm256_load_pd(&q[ldq+8]); - q3 = _mm256_add_pd(q3, _mm256_FMA_pd(y3, h2, x3)); - _mm256_store_pd(&q[ldq+8],q3); - q4 = _mm256_load_pd(&q[ldq+12]); - q4 = _mm256_add_pd(q4, _mm256_FMA_pd(y4, h2, x4)); - _mm256_store_pd(&q[ldq+12],q4); - q5 = _mm256_load_pd(&q[ldq+16]); - q5 = _mm256_add_pd(q5, _mm256_FMA_pd(y5, h2, x5)); - _mm256_store_pd(&q[ldq+16],q5); - q6 = _mm256_load_pd(&q[ldq+20]); - q6 = _mm256_add_pd(q6, _mm256_FMA_pd(y6, h2, x6)); - _mm256_store_pd(&q[ldq+20],q6); -#else - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_add_pd(q1, _mm256_add_pd(x1, _mm256_mul_pd(y1, h2))); - _mm256_store_pd(&q[ldq],q1); - q2 = _mm256_load_pd(&q[ldq+4]); - q2 = _mm256_add_pd(q2, _mm256_add_pd(x2, _mm256_mul_pd(y2, h2))); - _mm256_store_pd(&q[ldq+4],q2); - q3 = _mm256_load_pd(&q[ldq+8]); - q3 = _mm256_add_pd(q3, _mm256_add_pd(x3, _mm256_mul_pd(y3, h2))); - _mm256_store_pd(&q[ldq+8],q3); - q4 = _mm256_load_pd(&q[ldq+12]); - q4 = _mm256_add_pd(q4, _mm256_add_pd(x4, _mm256_mul_pd(y4, h2))); - _mm256_store_pd(&q[ldq+12],q4); - q5 = _mm256_load_pd(&q[ldq+16]); - q5 = _mm256_add_pd(q5, _mm256_add_pd(x5, _mm256_mul_pd(y5, h2))); - _mm256_store_pd(&q[ldq+16],q5); - q6 = _mm256_load_pd(&q[ldq+20]); - q6 = _mm256_add_pd(q6, _mm256_add_pd(x6, _mm256_mul_pd(y6, h2))); - _mm256_store_pd(&q[ldq+20],q6); -#endif - - for (i = 2; i < nb; i++) 
- { - h1 = _mm256_broadcast_sd(&hh[i-1]); - h2 = _mm256_broadcast_sd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_FMA_pd(x1, h1, q1); - q1 = _mm256_FMA_pd(y1, h2, q1); - _mm256_store_pd(&q[i*ldq],q1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q2 = _mm256_FMA_pd(x2, h1, q2); - q2 = _mm256_FMA_pd(y2, h2, q2); - _mm256_store_pd(&q[(i*ldq)+4],q2); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - q3 = _mm256_FMA_pd(x3, h1, q3); - q3 = _mm256_FMA_pd(y3, h2, q3); - _mm256_store_pd(&q[(i*ldq)+8],q3); - q4 = _mm256_load_pd(&q[(i*ldq)+12]); - q4 = _mm256_FMA_pd(x4, h1, q4); - q4 = _mm256_FMA_pd(y4, h2, q4); - _mm256_store_pd(&q[(i*ldq)+12],q4); - q5 = _mm256_load_pd(&q[(i*ldq)+16]); - q5 = _mm256_FMA_pd(x5, h1, q5); - q5 = _mm256_FMA_pd(y5, h2, q5); - _mm256_store_pd(&q[(i*ldq)+16],q5); - q6 = _mm256_load_pd(&q[(i*ldq)+20]); - q6 = _mm256_FMA_pd(x6, h1, q6); - q6 = _mm256_FMA_pd(y6, h2, q6); - _mm256_store_pd(&q[(i*ldq)+20],q6); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_add_pd(q1, _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2))); - _mm256_store_pd(&q[i*ldq],q1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q2 = _mm256_add_pd(q2, _mm256_add_pd(_mm256_mul_pd(x2,h1), _mm256_mul_pd(y2, h2))); - _mm256_store_pd(&q[(i*ldq)+4],q2); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - q3 = _mm256_add_pd(q3, _mm256_add_pd(_mm256_mul_pd(x3,h1), _mm256_mul_pd(y3, h2))); - _mm256_store_pd(&q[(i*ldq)+8],q3); - q4 = _mm256_load_pd(&q[(i*ldq)+12]); - q4 = _mm256_add_pd(q4, _mm256_add_pd(_mm256_mul_pd(x4,h1), _mm256_mul_pd(y4, h2))); - _mm256_store_pd(&q[(i*ldq)+12],q4); - q5 = _mm256_load_pd(&q[(i*ldq)+16]); - q5 = _mm256_add_pd(q5, _mm256_add_pd(_mm256_mul_pd(x5,h1), _mm256_mul_pd(y5, h2))); - _mm256_store_pd(&q[(i*ldq)+16],q5); - q6 = _mm256_load_pd(&q[(i*ldq)+20]); - q6 = _mm256_add_pd(q6, _mm256_add_pd(_mm256_mul_pd(x6,h1), _mm256_mul_pd(y6, h2))); - _mm256_store_pd(&q[(i*ldq)+20],q6); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-1]); -#ifdef 
__ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[nb*ldq]); - q1 = _mm256_FMA_pd(x1, h1, q1); - _mm256_store_pd(&q[nb*ldq],q1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - q2 = _mm256_FMA_pd(x2, h1, q2); - _mm256_store_pd(&q[(nb*ldq)+4],q2); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - q3 = _mm256_FMA_pd(x3, h1, q3); - _mm256_store_pd(&q[(nb*ldq)+8],q3); - q4 = _mm256_load_pd(&q[(nb*ldq)+12]); - q4 = _mm256_FMA_pd(x4, h1, q4); - _mm256_store_pd(&q[(nb*ldq)+12],q4); - q5 = _mm256_load_pd(&q[(nb*ldq)+16]); - q5 = _mm256_FMA_pd(x5, h1, q5); - _mm256_store_pd(&q[(nb*ldq)+16],q5); - q6 = _mm256_load_pd(&q[(nb*ldq)+20]); - q6 = _mm256_FMA_pd(x6, h1, q6); - _mm256_store_pd(&q[(nb*ldq)+20],q6); -#else - q1 = _mm256_load_pd(&q[nb*ldq]); - q1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); - _mm256_store_pd(&q[nb*ldq],q1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - q2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); - _mm256_store_pd(&q[(nb*ldq)+4],q2); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - q3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1)); - _mm256_store_pd(&q[(nb*ldq)+8],q3); - q4 = _mm256_load_pd(&q[(nb*ldq)+12]); - q4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1)); - _mm256_store_pd(&q[(nb*ldq)+12],q4); - q5 = _mm256_load_pd(&q[(nb*ldq)+16]); - q5 = _mm256_add_pd(q5, _mm256_mul_pd(x5, h1)); - _mm256_store_pd(&q[(nb*ldq)+16],q5); - q6 = _mm256_load_pd(&q[(nb*ldq)+20]); - q6 = _mm256_add_pd(q6, _mm256_mul_pd(x6, h1)); - _mm256_store_pd(&q[(nb*ldq)+20],q6); -#endif -} - -/** - * Unrolled kernel that computes - * 16 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 2 update is performed - */ - __forceinline void hh_trafo_kernel_16_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [16 x nb+1] * hh - // hh contains two householder vectors, with offset 1 - ///////////////////////////////////////////////////// - int i; - // Needed bit mask for floating 
point sign flip - __m256d sign = (__m256d)_mm256_set1_epi64x(0x8000000000000000); - - __m256d x1 = _mm256_load_pd(&q[ldq]); - __m256d x2 = _mm256_load_pd(&q[ldq+4]); - __m256d x3 = _mm256_load_pd(&q[ldq+8]); - __m256d x4 = _mm256_load_pd(&q[ldq+12]); - - __m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h2; - -#ifdef __ELPA_USE_FMA__ - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_FMA_pd(x1, h1, q1); - __m256d q2 = _mm256_load_pd(&q[4]); - __m256d y2 = _mm256_FMA_pd(x2, h1, q2); - __m256d q3 = _mm256_load_pd(&q[8]); - __m256d y3 = _mm256_FMA_pd(x3, h1, q3); - __m256d q4 = _mm256_load_pd(&q[12]); - __m256d y4 = _mm256_FMA_pd(x4, h1, q4); -#else - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); - __m256d q2 = _mm256_load_pd(&q[4]); - __m256d y2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); - __m256d q3 = _mm256_load_pd(&q[8]); - __m256d y3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1)); - __m256d q4 = _mm256_load_pd(&q[12]); - __m256d y4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1)); -#endif - - for(i = 2; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-1]); - h2 = _mm256_broadcast_sd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - x1 = _mm256_FMA_pd(q1, h1, x1); - y1 = _mm256_FMA_pd(q1, h2, y1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - x2 = _mm256_FMA_pd(q2, h1, x2); - y2 = _mm256_FMA_pd(q2, h2, y2); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - x3 = _mm256_FMA_pd(q3, h1, x3); - y3 = _mm256_FMA_pd(q3, h2, y3); - q4 = _mm256_load_pd(&q[(i*ldq)+12]); - x4 = _mm256_FMA_pd(q4, h1, x4); - y4 = _mm256_FMA_pd(q4, h2, y4); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); - y3 = _mm256_add_pd(y3, 
_mm256_mul_pd(q3,h2)); - q4 = _mm256_load_pd(&q[(i*ldq)+12]); - x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1)); - y4 = _mm256_add_pd(y4, _mm256_mul_pd(q4,h2)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[nb*ldq]); - x1 = _mm256_FMA_pd(q1, h1, x1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - x2 = _mm256_FMA_pd(q2, h1, x2); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - x3 = _mm256_FMA_pd(q3, h1, x3); - q4 = _mm256_load_pd(&q[(nb*ldq)+12]); - x4 = _mm256_FMA_pd(q4, h1, x4); -#else - q1 = _mm256_load_pd(&q[nb*ldq]); - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); - q4 = _mm256_load_pd(&q[(nb*ldq)+12]); - x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-2 update of Q [16 x nb+1] - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(hh); - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d vs = _mm256_broadcast_sd(&s); - - h1 = _mm256_xor_pd(tau1, sign); - x1 = _mm256_mul_pd(x1, h1); - x2 = _mm256_mul_pd(x2, h1); - x3 = _mm256_mul_pd(x3, h1); - x4 = _mm256_mul_pd(x4, h1); - h1 = _mm256_xor_pd(tau2, sign); - h2 = _mm256_mul_pd(h1, vs); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(y1, h1, _mm256_mul_pd(x1,h2)); - y2 = _mm256_FMA_pd(y2, h1, _mm256_mul_pd(x2,h2)); - y3 = _mm256_FMA_pd(y3, h1, _mm256_mul_pd(x3,h2)); - y4 = _mm256_FMA_pd(y4, h1, _mm256_mul_pd(x4,h2)); -#else - y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); - y2 = _mm256_add_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2)); - y3 = _mm256_add_pd(_mm256_mul_pd(y3,h1), _mm256_mul_pd(x3,h2)); - y4 = _mm256_add_pd(_mm256_mul_pd(y4,h1), _mm256_mul_pd(x4,h2)); -#endif - - q1 = _mm256_load_pd(q); - q1 = _mm256_add_pd(q1, y1); - _mm256_store_pd(q,q1); - q2 = 
_mm256_load_pd(&q[4]); - q2 = _mm256_add_pd(q2, y2); - _mm256_store_pd(&q[4],q2); - q3 = _mm256_load_pd(&q[8]); - q3 = _mm256_add_pd(q3, y3); - _mm256_store_pd(&q[8],q3); - q4 = _mm256_load_pd(&q[12]); - q4 = _mm256_add_pd(q4, y4); - _mm256_store_pd(&q[12],q4); - - h2 = _mm256_broadcast_sd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_add_pd(q1, _mm256_FMA_pd(y1, h2, x1)); - _mm256_store_pd(&q[ldq],q1); - q2 = _mm256_load_pd(&q[ldq+4]); - q2 = _mm256_add_pd(q2, _mm256_FMA_pd(y2, h2, x2)); - _mm256_store_pd(&q[ldq+4],q2); - q3 = _mm256_load_pd(&q[ldq+8]); - q3 = _mm256_add_pd(q3, _mm256_FMA_pd(y3, h2, x3)); - _mm256_store_pd(&q[ldq+8],q3); - q4 = _mm256_load_pd(&q[ldq+12]); - q4 = _mm256_add_pd(q4, _mm256_FMA_pd(y4, h2, x4)); - _mm256_store_pd(&q[ldq+12],q4); -#else - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_add_pd(q1, _mm256_add_pd(x1, _mm256_mul_pd(y1, h2))); - _mm256_store_pd(&q[ldq],q1); - q2 = _mm256_load_pd(&q[ldq+4]); - q2 = _mm256_add_pd(q2, _mm256_add_pd(x2, _mm256_mul_pd(y2, h2))); - _mm256_store_pd(&q[ldq+4],q2); - q3 = _mm256_load_pd(&q[ldq+8]); - q3 = _mm256_add_pd(q3, _mm256_add_pd(x3, _mm256_mul_pd(y3, h2))); - _mm256_store_pd(&q[ldq+8],q3); - q4 = _mm256_load_pd(&q[ldq+12]); - q4 = _mm256_add_pd(q4, _mm256_add_pd(x4, _mm256_mul_pd(y4, h2))); - _mm256_store_pd(&q[ldq+12],q4); -#endif - - for (i = 2; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-1]); - h2 = _mm256_broadcast_sd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_FMA_pd(x1, h1, q1); - q1 = _mm256_FMA_pd(y1, h2, q1); - _mm256_store_pd(&q[i*ldq],q1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q2 = _mm256_FMA_pd(x2, h1, q2); - q2 = _mm256_FMA_pd(y2, h2, q2); - _mm256_store_pd(&q[(i*ldq)+4],q2); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - q3 = _mm256_FMA_pd(x3, h1, q3); - q3 = _mm256_FMA_pd(y3, h2, q3); - _mm256_store_pd(&q[(i*ldq)+8],q3); - q4 = _mm256_load_pd(&q[(i*ldq)+12]); - q4 = _mm256_FMA_pd(x4, h1, q4); - q4 = 
_mm256_FMA_pd(y4, h2, q4); - _mm256_store_pd(&q[(i*ldq)+12],q4); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_add_pd(q1, _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2))); - _mm256_store_pd(&q[i*ldq],q1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q2 = _mm256_add_pd(q2, _mm256_add_pd(_mm256_mul_pd(x2,h1), _mm256_mul_pd(y2, h2))); - _mm256_store_pd(&q[(i*ldq)+4],q2); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - q3 = _mm256_add_pd(q3, _mm256_add_pd(_mm256_mul_pd(x3,h1), _mm256_mul_pd(y3, h2))); - _mm256_store_pd(&q[(i*ldq)+8],q3); - q4 = _mm256_load_pd(&q[(i*ldq)+12]); - q4 = _mm256_add_pd(q4, _mm256_add_pd(_mm256_mul_pd(x4,h1), _mm256_mul_pd(y4, h2))); - _mm256_store_pd(&q[(i*ldq)+12],q4); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[nb*ldq]); - q1 = _mm256_FMA_pd(x1, h1, q1); - _mm256_store_pd(&q[nb*ldq],q1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - q2 = _mm256_FMA_pd(x2, h1, q2); - _mm256_store_pd(&q[(nb*ldq)+4],q2); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - q3 = _mm256_FMA_pd(x3, h1, q3); - _mm256_store_pd(&q[(nb*ldq)+8],q3); - q4 = _mm256_load_pd(&q[(nb*ldq)+12]); - q4 = _mm256_FMA_pd(x4, h1, q4); - _mm256_store_pd(&q[(nb*ldq)+12],q4); -#else - q1 = _mm256_load_pd(&q[nb*ldq]); - q1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); - _mm256_store_pd(&q[nb*ldq],q1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - q2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); - _mm256_store_pd(&q[(nb*ldq)+4],q2); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - q3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1)); - _mm256_store_pd(&q[(nb*ldq)+8],q3); - q4 = _mm256_load_pd(&q[(nb*ldq)+12]); - q4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1)); - _mm256_store_pd(&q[(nb*ldq)+12],q4); -#endif -} - -/** - * Unrolled kernel that computes - * 8 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 2 update is performed - */ - __forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ldq, 
int ldh, double s) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [8 x nb+1] * hh - // hh contains two householder vectors, with offset 1 - ///////////////////////////////////////////////////// - int i; - // Needed bit mask for floating point sign flip - __m256d sign = (__m256d)_mm256_set1_epi64x(0x8000000000000000); - - __m256d x1 = _mm256_load_pd(&q[ldq]); - __m256d x2 = _mm256_load_pd(&q[ldq+4]); - - __m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h2; - -#ifdef __ELPA_USE_FMA__ - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_FMA_pd(x1, h1, q1); - __m256d q2 = _mm256_load_pd(&q[4]); - __m256d y2 = _mm256_FMA_pd(x2, h1, q2); -#else - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); - __m256d q2 = _mm256_load_pd(&q[4]); - __m256d y2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - - for(i = 2; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-1]); - h2 = _mm256_broadcast_sd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - x1 = _mm256_FMA_pd(q1, h1, x1); - y1 = _mm256_FMA_pd(q1, h2, y1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - x2 = _mm256_FMA_pd(q2, h1, x2); - y2 = _mm256_FMA_pd(q2, h2, y2); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[nb*ldq]); - x1 = _mm256_FMA_pd(q1, h1, x1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - x2 = _mm256_FMA_pd(q2, h1, x2); -#else - q1 = _mm256_load_pd(&q[nb*ldq]); - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-2 update of 
Q [8 x nb+1] - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(hh); - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d vs = _mm256_broadcast_sd(&s); - - h1 = _mm256_xor_pd(tau1, sign); - x1 = _mm256_mul_pd(x1, h1); - x2 = _mm256_mul_pd(x2, h1); - h1 = _mm256_xor_pd(tau2, sign); - h2 = _mm256_mul_pd(h1, vs); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(y1, h1, _mm256_mul_pd(x1,h2)); - y2 = _mm256_FMA_pd(y2, h1, _mm256_mul_pd(x2,h2)); -#else - y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); - y2 = _mm256_add_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2)); -#endif - - q1 = _mm256_load_pd(q); - q1 = _mm256_add_pd(q1, y1); - _mm256_store_pd(q,q1); - q2 = _mm256_load_pd(&q[4]); - q2 = _mm256_add_pd(q2, y2); - _mm256_store_pd(&q[4],q2); - - h2 = _mm256_broadcast_sd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_add_pd(q1, _mm256_FMA_pd(y1, h2, x1)); - _mm256_store_pd(&q[ldq],q1); - q2 = _mm256_load_pd(&q[ldq+4]); - q2 = _mm256_add_pd(q2, _mm256_FMA_pd(y2, h2, x2)); - _mm256_store_pd(&q[ldq+4],q2); -#else - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_add_pd(q1, _mm256_add_pd(x1, _mm256_mul_pd(y1, h2))); - _mm256_store_pd(&q[ldq],q1); - q2 = _mm256_load_pd(&q[ldq+4]); - q2 = _mm256_add_pd(q2, _mm256_add_pd(x2, _mm256_mul_pd(y2, h2))); - _mm256_store_pd(&q[ldq+4],q2); -#endif - - for (i = 2; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-1]); - h2 = _mm256_broadcast_sd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_FMA_pd(x1, h1, q1); - q1 = _mm256_FMA_pd(y1, h2, q1); - _mm256_store_pd(&q[i*ldq],q1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q2 = _mm256_FMA_pd(x2, h1, q2); - q2 = _mm256_FMA_pd(y2, h2, q2); - _mm256_store_pd(&q[(i*ldq)+4],q2); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_add_pd(q1, _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2))); - _mm256_store_pd(&q[i*ldq],q1); - q2 = 
_mm256_load_pd(&q[(i*ldq)+4]); - q2 = _mm256_add_pd(q2, _mm256_add_pd(_mm256_mul_pd(x2,h1), _mm256_mul_pd(y2, h2))); - _mm256_store_pd(&q[(i*ldq)+4],q2); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[nb*ldq]); - q1 = _mm256_FMA_pd(x1, h1, q1); - _mm256_store_pd(&q[nb*ldq],q1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - q2 = _mm256_FMA_pd(x2, h1, q2); - _mm256_store_pd(&q[(nb*ldq)+4],q2); -#else - q1 = _mm256_load_pd(&q[nb*ldq]); - q1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); - _mm256_store_pd(&q[nb*ldq],q1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - q2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); - _mm256_store_pd(&q[(nb*ldq)+4],q2); -#endif -} - -/** - * Unrolled kernel that computes - * 4 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 2 update is performed - */ - __forceinline void hh_trafo_kernel_4_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [4 x nb+1] * hh - // hh contains two householder vectors, with offset 1 - ///////////////////////////////////////////////////// - int i; - // Needed bit mask for floating point sign flip - __m256d sign = (__m256d)_mm256_set1_epi64x(0x8000000000000000); - - __m256d x1 = _mm256_load_pd(&q[ldq]); - - __m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h2; - -#ifdef __ELPA_USE_FMA__ - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_FMA_pd(x1, h1, q1); -#else - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); -#endif - - for(i = 2; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-1]); - h2 = _mm256_broadcast_sd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - x1 = _mm256_FMA_pd(q1, h1, x1); - y1 = _mm256_FMA_pd(q1, h2, y1); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - y1 = _mm256_add_pd(y1, 
_mm256_mul_pd(q1,h2)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[nb*ldq]); - x1 = _mm256_FMA_pd(q1, h1, x1); -#else - q1 = _mm256_load_pd(&q[nb*ldq]); - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-2 update of Q [4 x nb+1] - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(hh); - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d vs = _mm256_broadcast_sd(&s); - - h1 = _mm256_xor_pd(tau1, sign); - x1 = _mm256_mul_pd(x1, h1); - h1 = _mm256_xor_pd(tau2, sign); - h2 = _mm256_mul_pd(h1, vs); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(y1, h1, _mm256_mul_pd(x1,h2)); -#else - y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); -#endif - - q1 = _mm256_load_pd(q); - q1 = _mm256_add_pd(q1, y1); - _mm256_store_pd(q,q1); - - h2 = _mm256_broadcast_sd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_add_pd(q1, _mm256_FMA_pd(y1, h2, x1)); - _mm256_store_pd(&q[ldq],q1); -#else - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_add_pd(q1, _mm256_add_pd(x1, _mm256_mul_pd(y1, h2))); - _mm256_store_pd(&q[ldq],q1); -#endif - - for (i = 2; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-1]); - h2 = _mm256_broadcast_sd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_FMA_pd(x1, h1, q1); - q1 = _mm256_FMA_pd(y1, h2, q1); - _mm256_store_pd(&q[i*ldq],q1); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_add_pd(q1, _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2))); - _mm256_store_pd(&q[i*ldq],q1); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[nb*ldq]); - q1 = _mm256_FMA_pd(x1, h1, q1); - _mm256_store_pd(&q[nb*ldq],q1); -#else - q1 = _mm256_load_pd(&q[nb*ldq]); - q1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); - _mm256_store_pd(&q[nb*ldq],q1); -#endif -} 
-#else -/** - * Unrolled kernel that computes - * 12 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 2 update is performed - */ - __forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [12 x nb+1] * hh - // hh contains two householder vectors, with offset 1 - ///////////////////////////////////////////////////// - int i; - // Needed bit mask for floating point sign flip - __m64 smallsign = _mm_set_pi32(0x80000000, 0x00000000); - __m128d sign = (__m128d)_mm_set1_epi64(smallsign); - - __m128d x1 = _mm_load_pd(&q[ldq]); - __m128d x2 = _mm_load_pd(&q[ldq+2]); - __m128d x3 = _mm_load_pd(&q[ldq+4]); - __m128d x4 = _mm_load_pd(&q[ldq+6]); - __m128d x5 = _mm_load_pd(&q[ldq+8]); - __m128d x6 = _mm_load_pd(&q[ldq+10]); - - __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); - __m128d h2; - -#ifdef __ELPA_USE_FMA__ - __m128d q1 = _mm_load_pd(q); - __m128d y1 = _mm_macc_pd(x1, h1, q1); - __m128d q2 = _mm_load_pd(&q[2]); - __m128d y2 = _mm_macc_pd(x2, h1, q2); - __m128d q3 = _mm_load_pd(&q[4]); - __m128d y3 = _mm_macc_pd(x3, h1, q3); - __m128d q4 = _mm_load_pd(&q[6]); - __m128d y4 = _mm_macc_pd(x4, h1, q4); - __m128d q5 = _mm_load_pd(&q[8]); - __m128d y5 = _mm_macc_pd(x5, h1, q5); - __m128d q6 = _mm_load_pd(&q[10]); - __m128d y6 = _mm_macc_pd(x6, h1, q6); -#else - __m128d q1 = _mm_load_pd(q); - __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); - __m128d q2 = _mm_load_pd(&q[2]); - __m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); - __m128d q3 = _mm_load_pd(&q[4]); - __m128d y3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); - __m128d q4 = _mm_load_pd(&q[6]); - __m128d y4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); - __m128d q5 = _mm_load_pd(&q[8]); - __m128d y5 = _mm_add_pd(q5, _mm_mul_pd(x5, h1)); - __m128d q6 = _mm_load_pd(&q[10]); - __m128d y6 = _mm_add_pd(q6, _mm_mul_pd(x6, h1)); -#endif - for(i = 2; i < nb; i++) 
- { - h1 = _mm_loaddup_pd(&hh[i-1]); - h2 = _mm_loaddup_pd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[i*ldq]); - x1 = _mm_macc_pd(q1, h1, x1); - y1 = _mm_macc_pd(q1, h2, y1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - x2 = _mm_macc_pd(q2, h1, x2); - y2 = _mm_macc_pd(q2, h2, y2); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - x3 = _mm_macc_pd(q3, h1, x3); - y3 = _mm_macc_pd(q3, h2, y3); - q4 = _mm_load_pd(&q[(i*ldq)+6]); - x4 = _mm_macc_pd(q4, h1, x4); - y4 = _mm_macc_pd(q4, h2, y4); - q5 = _mm_load_pd(&q[(i*ldq)+8]); - x5 = _mm_macc_pd(q5, h1, x5); - y5 = _mm_macc_pd(q5, h2, y5); - q6 = _mm_load_pd(&q[(i*ldq)+10]); - x6 = _mm_macc_pd(q6, h1, x6); - y6 = _mm_macc_pd(q6, h2, y6); -#else - q1 = _mm_load_pd(&q[i*ldq]); - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); - y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); - q4 = _mm_load_pd(&q[(i*ldq)+6]); - x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); - y4 = _mm_add_pd(y4, _mm_mul_pd(q4,h2)); - q5 = _mm_load_pd(&q[(i*ldq)+8]); - x5 = _mm_add_pd(x5, _mm_mul_pd(q5,h1)); - y5 = _mm_add_pd(y5, _mm_mul_pd(q5,h2)); - q6 = _mm_load_pd(&q[(i*ldq)+10]); - x6 = _mm_add_pd(x6, _mm_mul_pd(q6,h1)); - y6 = _mm_add_pd(y6, _mm_mul_pd(q6,h2)); -#endif - } - - h1 = _mm_loaddup_pd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[nb*ldq]); - x1 = _mm_macc_pd(q1, h1, x1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - x2 = _mm_macc_pd(q2, h1, x2); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - x3 = _mm_macc_pd(q3, h1, x3); - q4 = _mm_load_pd(&q[(nb*ldq)+6]); - x4 = _mm_macc_pd(q4, h1, x4); - q5 = _mm_load_pd(&q[(nb*ldq)+8]); - x5 = _mm_macc_pd(q5, h1, x5); - q6 = _mm_load_pd(&q[(nb*ldq)+10]); - x6 = _mm_macc_pd(q6, h1, x6); -#else - q1 = _mm_load_pd(&q[nb*ldq]); - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - q2 = 
_mm_load_pd(&q[(nb*ldq)+2]); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); - q4 = _mm_load_pd(&q[(nb*ldq)+6]); - x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); - q5 = _mm_load_pd(&q[(nb*ldq)+8]); - x5 = _mm_add_pd(x5, _mm_mul_pd(q5,h1)); - q6 = _mm_load_pd(&q[(nb*ldq)+10]); - x6 = _mm_add_pd(x6, _mm_mul_pd(q6,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-2 update of Q [12 x nb+1] - ///////////////////////////////////////////////////// - - __m128d tau1 = _mm_loaddup_pd(hh); - __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); - __m128d vs = _mm_loaddup_pd(&s); - - h1 = _mm_xor_pd(tau1, sign); - x1 = _mm_mul_pd(x1, h1); - x2 = _mm_mul_pd(x2, h1); - x3 = _mm_mul_pd(x3, h1); - x4 = _mm_mul_pd(x4, h1); - x5 = _mm_mul_pd(x5, h1); - x6 = _mm_mul_pd(x6, h1); - h1 = _mm_xor_pd(tau2, sign); - h2 = _mm_mul_pd(h1, vs); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(y1, h1, _mm_mul_pd(x1,h2)); - y2 = _mm_macc_pd(y2, h1, _mm_mul_pd(x2,h2)); - y3 = _mm_macc_pd(y3, h1, _mm_mul_pd(x3,h2)); - y4 = _mm_macc_pd(y4, h1, _mm_mul_pd(x4,h2)); - y5 = _mm_macc_pd(y5, h1, _mm_mul_pd(x5,h2)); - y6 = _mm_macc_pd(y6, h1, _mm_mul_pd(x6,h2)); -#else - y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); - y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); - y3 = _mm_add_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); - y4 = _mm_add_pd(_mm_mul_pd(y4,h1), _mm_mul_pd(x4,h2)); - y5 = _mm_add_pd(_mm_mul_pd(y5,h1), _mm_mul_pd(x5,h2)); - y6 = _mm_add_pd(_mm_mul_pd(y6,h1), _mm_mul_pd(x6,h2)); -#endif - - q1 = _mm_load_pd(q); - q1 = _mm_add_pd(q1, y1); - _mm_store_pd(q,q1); - q2 = _mm_load_pd(&q[2]); - q2 = _mm_add_pd(q2, y2); - _mm_store_pd(&q[2],q2); - q3 = _mm_load_pd(&q[4]); - q3 = _mm_add_pd(q3, y3); - _mm_store_pd(&q[4],q3); - q4 = _mm_load_pd(&q[6]); - q4 = _mm_add_pd(q4, y4); - _mm_store_pd(&q[6],q4); - q5 = _mm_load_pd(&q[8]); - q5 = _mm_add_pd(q5, y5); - _mm_store_pd(&q[8],q5); - q6 = _mm_load_pd(&q[10]); - 
q6 = _mm_add_pd(q6, y6); - _mm_store_pd(&q[10],q6); - - h2 = _mm_loaddup_pd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[ldq]); - q1 = _mm_add_pd(q1, _mm_macc_pd(y1, h2, x1)); - _mm_store_pd(&q[ldq],q1); - q2 = _mm_load_pd(&q[ldq+2]); - q2 = _mm_add_pd(q2, _mm_macc_pd(y2, h2, x2)); - _mm_store_pd(&q[ldq+2],q2); - q3 = _mm_load_pd(&q[ldq+4]); - q3 = _mm_add_pd(q3, _mm_macc_pd(y3, h2, x3)); - _mm_store_pd(&q[ldq+4],q3); - q4 = _mm_load_pd(&q[ldq+6]); - q4 = _mm_add_pd(q4, _mm_macc_pd(y4, h2, x4)); - _mm_store_pd(&q[ldq+6],q4); - q5 = _mm_load_pd(&q[ldq+8]); - q5 = _mm_add_pd(q5, _mm_macc_pd(y5, h2, x5)); - _mm_store_pd(&q[ldq+8],q5); - q6 = _mm_load_pd(&q[ldq+10]); - q6 = _mm_add_pd(q6, _mm_macc_pd(y6, h2, x6)); - _mm_store_pd(&q[ldq+10],q6); -#else - q1 = _mm_load_pd(&q[ldq]); - q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[ldq],q1); - q2 = _mm_load_pd(&q[ldq+2]); - q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[ldq+2],q2); - q3 = _mm_load_pd(&q[ldq+4]); - q3 = _mm_add_pd(q3, _mm_add_pd(x3, _mm_mul_pd(y3, h2))); - _mm_store_pd(&q[ldq+4],q3); - q4 = _mm_load_pd(&q[ldq+6]); - q4 = _mm_add_pd(q4, _mm_add_pd(x4, _mm_mul_pd(y4, h2))); - _mm_store_pd(&q[ldq+6],q4); - q5 = _mm_load_pd(&q[ldq+8]); - q5 = _mm_add_pd(q5, _mm_add_pd(x5, _mm_mul_pd(y5, h2))); - _mm_store_pd(&q[ldq+8],q5); - q6 = _mm_load_pd(&q[ldq+10]); - q6 = _mm_add_pd(q6, _mm_add_pd(x6, _mm_mul_pd(y6, h2))); - _mm_store_pd(&q[ldq+10],q6); -#endif - for (i = 2; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-1]); - h2 = _mm_loaddup_pd(&hh[ldh+i]); - -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[i*ldq]); - q1 = _mm_add_pd(q1, _mm_macc_pd(x1, h1, _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[i*ldq],q1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_macc_pd(x2, h1, _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[(i*ldq)+2],q2); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - q3 = _mm_add_pd(q3, _mm_macc_pd(x3, h1, _mm_mul_pd(y3, h2))); - 
_mm_store_pd(&q[(i*ldq)+4],q3); - q4 = _mm_load_pd(&q[(i*ldq)+6]); - q4 = _mm_add_pd(q4, _mm_macc_pd(x4, h1, _mm_mul_pd(y4, h2))); - _mm_store_pd(&q[(i*ldq)+6],q4); - q5 = _mm_load_pd(&q[(i*ldq)+8]); - q5 = _mm_add_pd(q5, _mm_macc_pd(x5, h1, _mm_mul_pd(y5, h2))); - _mm_store_pd(&q[(i*ldq)+8],q5); - q6 = _mm_load_pd(&q[(i*ldq)+10]); - q6 = _mm_add_pd(q6, _mm_macc_pd(x6, h1, _mm_mul_pd(y6, h2))); - _mm_store_pd(&q[(i*ldq)+10],q6); -#else - q1 = _mm_load_pd(&q[i*ldq]); - q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[i*ldq],q1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[(i*ldq)+2],q2); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - q3 = _mm_add_pd(q3, _mm_add_pd(_mm_mul_pd(x3,h1), _mm_mul_pd(y3, h2))); - _mm_store_pd(&q[(i*ldq)+4],q3); - q4 = _mm_load_pd(&q[(i*ldq)+6]); - q4 = _mm_add_pd(q4, _mm_add_pd(_mm_mul_pd(x4,h1), _mm_mul_pd(y4, h2))); - _mm_store_pd(&q[(i*ldq)+6],q4); - q5 = _mm_load_pd(&q[(i*ldq)+8]); - q5 = _mm_add_pd(q5, _mm_add_pd(_mm_mul_pd(x5,h1), _mm_mul_pd(y5, h2))); - _mm_store_pd(&q[(i*ldq)+8],q5); - q6 = _mm_load_pd(&q[(i*ldq)+10]); - q6 = _mm_add_pd(q6, _mm_add_pd(_mm_mul_pd(x6,h1), _mm_mul_pd(y6, h2))); - _mm_store_pd(&q[(i*ldq)+10],q6); -#endif - } - - h1 = _mm_loaddup_pd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[nb*ldq]); - q1 = _mm_macc_pd(x1, h1, q1); - _mm_store_pd(&q[nb*ldq],q1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - q2 = _mm_macc_pd(x2, h1, q2); - _mm_store_pd(&q[(nb*ldq)+2],q2); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - q3 = _mm_macc_pd(x3, h1, q3); - _mm_store_pd(&q[(nb*ldq)+4],q3); - q4 = _mm_load_pd(&q[(nb*ldq)+6]); - q4 = _mm_macc_pd(x4, h1, q4); - _mm_store_pd(&q[(nb*ldq)+6],q4); - q5 = _mm_load_pd(&q[(nb*ldq)+8]); - q5 = _mm_macc_pd(x5, h1, q5); - _mm_store_pd(&q[(nb*ldq)+8],q5); - q6 = _mm_load_pd(&q[(nb*ldq)+10]); - q6 = _mm_macc_pd(x6, h1, q6); - _mm_store_pd(&q[(nb*ldq)+10],q6); -#else - q1 = 
_mm_load_pd(&q[nb*ldq]); - q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); - _mm_store_pd(&q[nb*ldq],q1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); - _mm_store_pd(&q[(nb*ldq)+2],q2); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - q3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); - _mm_store_pd(&q[(nb*ldq)+4],q3); - q4 = _mm_load_pd(&q[(nb*ldq)+6]); - q4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); - _mm_store_pd(&q[(nb*ldq)+6],q4); - q5 = _mm_load_pd(&q[(nb*ldq)+8]); - q5 = _mm_add_pd(q5, _mm_mul_pd(x5, h1)); - _mm_store_pd(&q[(nb*ldq)+8],q5); - q6 = _mm_load_pd(&q[(nb*ldq)+10]); - q6 = _mm_add_pd(q6, _mm_mul_pd(x6, h1)); - _mm_store_pd(&q[(nb*ldq)+10],q6); -#endif -} - -/** - * Unrolled kernel that computes - * 8 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 2 update is performed - */ -__forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [8 x nb+1] * hh - // hh contains two householder vectors, with offset 1 - ///////////////////////////////////////////////////// - int i; - // Needed bit mask for floating point sign flip - __m64 smallsign = _mm_set_pi32(0x80000000, 0x00000000); - __m128d sign = (__m128d)_mm_set1_epi64(smallsign); - - __m128d x1 = _mm_load_pd(&q[ldq]); - __m128d x2 = _mm_load_pd(&q[ldq+2]); - __m128d x3 = _mm_load_pd(&q[ldq+4]); - __m128d x4 = _mm_load_pd(&q[ldq+6]); - - __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); - __m128d h2; - -#ifdef __ELPA_USE_FMA__ - __m128d q1 = _mm_load_pd(q); - __m128d y1 = _mm_macc_pd(x1, h1, q1); - __m128d q2 = _mm_load_pd(&q[2]); - __m128d y2 = _mm_macc_pd(x2, h1, q2); - __m128d q3 = _mm_load_pd(&q[4]); - __m128d y3 = _mm_macc_pd(x3, h1, q3); - __m128d q4 = _mm_load_pd(&q[6]); - __m128d y4 = _mm_macc_pd(x4, h1, q4); -#else - __m128d q1 = _mm_load_pd(q); - __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); - __m128d q2 = 
_mm_load_pd(&q[2]); - __m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); - __m128d q3 = _mm_load_pd(&q[4]); - __m128d y3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); - __m128d q4 = _mm_load_pd(&q[6]); - __m128d y4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); -#endif - - for(i = 2; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-1]); - h2 = _mm_loaddup_pd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[i*ldq]); - x1 = _mm_macc_pd(q1, h1, x1); - y1 = _mm_macc_pd(q1, h2, y1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - x2 = _mm_macc_pd(q2, h1, x2); - y2 = _mm_macc_pd(q2, h2, y2); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - x3 = _mm_macc_pd(q3, h1, x3); - y3 = _mm_macc_pd(q3, h2, y3); - q4 = _mm_load_pd(&q[(i*ldq)+6]); - x4 = _mm_macc_pd(q4, h1, x4); - y4 = _mm_macc_pd(q4, h2, y4); -#else - q1 = _mm_load_pd(&q[i*ldq]); - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); - y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); - q4 = _mm_load_pd(&q[(i*ldq)+6]); - x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); - y4 = _mm_add_pd(y4, _mm_mul_pd(q4,h2)); -#endif - } - - h1 = _mm_loaddup_pd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[nb*ldq]); - x1 = _mm_macc_pd(q1, h1, x1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - x2 = _mm_macc_pd(q2, h1, x2); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - x3 = _mm_macc_pd(q3, h1, x3); - q4 = _mm_load_pd(&q[(nb*ldq)+6]); - x4 = _mm_macc_pd(q4, h1, x4); -#else - q1 = _mm_load_pd(&q[nb*ldq]); - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); - q4 = _mm_load_pd(&q[(nb*ldq)+6]); - x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-2 update of Q [8 x 
nb+1] - ///////////////////////////////////////////////////// - - __m128d tau1 = _mm_loaddup_pd(hh); - __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); - __m128d vs = _mm_loaddup_pd(&s); - - h1 = _mm_xor_pd(tau1, sign); - x1 = _mm_mul_pd(x1, h1); - x2 = _mm_mul_pd(x2, h1); - x3 = _mm_mul_pd(x3, h1); - x4 = _mm_mul_pd(x4, h1); - h1 = _mm_xor_pd(tau2, sign); - h2 = _mm_mul_pd(h1, vs); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(y1, h1, _mm_mul_pd(x1,h2)); - y2 = _mm_macc_pd(y2, h1, _mm_mul_pd(x2,h2)); - y3 = _mm_macc_pd(y3, h1, _mm_mul_pd(x3,h2)); - y4 = _mm_macc_pd(y4, h1, _mm_mul_pd(x4,h2)); -#else - y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); - y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); - y3 = _mm_add_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); - y4 = _mm_add_pd(_mm_mul_pd(y4,h1), _mm_mul_pd(x4,h2)); -#endif - - q1 = _mm_load_pd(q); - q1 = _mm_add_pd(q1, y1); - _mm_store_pd(q,q1); - q2 = _mm_load_pd(&q[2]); - q2 = _mm_add_pd(q2, y2); - _mm_store_pd(&q[2],q2); - q3 = _mm_load_pd(&q[4]); - q3 = _mm_add_pd(q3, y3); - _mm_store_pd(&q[4],q3); - q4 = _mm_load_pd(&q[6]); - q4 = _mm_add_pd(q4, y4); - _mm_store_pd(&q[6],q4); - - h2 = _mm_loaddup_pd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[ldq]); - q1 = _mm_add_pd(q1, _mm_macc_pd(y1, h2, x1)); - _mm_store_pd(&q[ldq],q1); - q2 = _mm_load_pd(&q[ldq+2]); - q2 = _mm_add_pd(q2, _mm_macc_pd(y2, h2, x2)); - _mm_store_pd(&q[ldq+2],q2); - q3 = _mm_load_pd(&q[ldq+4]); - q3 = _mm_add_pd(q3, _mm_macc_pd(y3, h2, x3)); - _mm_store_pd(&q[ldq+4],q3); - q4 = _mm_load_pd(&q[ldq+6]); - q4 = _mm_add_pd(q4, _mm_macc_pd(y4, h2, x4)); - _mm_store_pd(&q[ldq+6],q4); -#else - q1 = _mm_load_pd(&q[ldq]); - q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[ldq],q1); - q2 = _mm_load_pd(&q[ldq+2]); - q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[ldq+2],q2); - q3 = _mm_load_pd(&q[ldq+4]); - q3 = _mm_add_pd(q3, _mm_add_pd(x3, _mm_mul_pd(y3, h2))); - 
_mm_store_pd(&q[ldq+4],q3); - q4 = _mm_load_pd(&q[ldq+6]); - q4 = _mm_add_pd(q4, _mm_add_pd(x4, _mm_mul_pd(y4, h2))); - _mm_store_pd(&q[ldq+6],q4); -#endif - - for (i = 2; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-1]); - h2 = _mm_loaddup_pd(&hh[ldh+i]); - -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[i*ldq]); - q1 = _mm_add_pd(q1, _mm_macc_pd(x1, h1, _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[i*ldq],q1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_macc_pd(x2, h1, _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[(i*ldq)+2],q2); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - q3 = _mm_add_pd(q3, _mm_macc_pd(x3, h1, _mm_mul_pd(y3, h2))); - _mm_store_pd(&q[(i*ldq)+4],q3); - q4 = _mm_load_pd(&q[(i*ldq)+6]); - q4 = _mm_add_pd(q4, _mm_macc_pd(x4, h1, _mm_mul_pd(y4, h2))); - _mm_store_pd(&q[(i*ldq)+6],q4); -#else - q1 = _mm_load_pd(&q[i*ldq]); - q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[i*ldq],q1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[(i*ldq)+2],q2); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - q3 = _mm_add_pd(q3, _mm_add_pd(_mm_mul_pd(x3,h1), _mm_mul_pd(y3, h2))); - _mm_store_pd(&q[(i*ldq)+4],q3); - q4 = _mm_load_pd(&q[(i*ldq)+6]); - q4 = _mm_add_pd(q4, _mm_add_pd(_mm_mul_pd(x4,h1), _mm_mul_pd(y4, h2))); - _mm_store_pd(&q[(i*ldq)+6],q4); -#endif - } - - h1 = _mm_loaddup_pd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[nb*ldq]); - q1 = _mm_macc_pd(x1, h1, q1); - _mm_store_pd(&q[nb*ldq],q1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - q2 = _mm_macc_pd(x2, h1, q2); - _mm_store_pd(&q[(nb*ldq)+2],q2); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - q3 = _mm_macc_pd(x3, h1, q3); - _mm_store_pd(&q[(nb*ldq)+4],q3); - q4 = _mm_load_pd(&q[(nb*ldq)+6]); - q4 = _mm_macc_pd(x4, h1, q4); - _mm_store_pd(&q[(nb*ldq)+6],q4); -#else - q1 = _mm_load_pd(&q[nb*ldq]); - q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); - _mm_store_pd(&q[nb*ldq],q1); - q2 = 
_mm_load_pd(&q[(nb*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); - _mm_store_pd(&q[(nb*ldq)+2],q2); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - q3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); - _mm_store_pd(&q[(nb*ldq)+4],q3); - q4 = _mm_load_pd(&q[(nb*ldq)+6]); - q4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); - _mm_store_pd(&q[(nb*ldq)+6],q4); -#endif -} - -/** - * Unrolled kernel that computes - * 4 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 2 update is performed - */ -__forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [4 x nb+1] * hh - // hh contains two householder vectors, with offset 1 - ///////////////////////////////////////////////////// - int i; - // Needed bit mask for floating point sign flip - __m64 smallsign = _mm_set_pi32(0x80000000, 0x00000000); - __m128d sign = (__m128d)_mm_set1_epi64(smallsign); - - __m128d x1 = _mm_load_pd(&q[ldq]); - __m128d x2 = _mm_load_pd(&q[ldq+2]); - - __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); - __m128d h2; - -#ifdef __ELPA_USE_FMA__ - __m128d q1 = _mm_load_pd(q); - __m128d y1 = _mm_macc_pd(x1, h1, q1); - __m128d q2 = _mm_load_pd(&q[2]); - __m128d y2 = _mm_macc_pd(x2, h1, q2); -#else - __m128d q1 = _mm_load_pd(q); - __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); - __m128d q2 = _mm_load_pd(&q[2]); - __m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); -#endif - - for(i = 2; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-1]); - h2 = _mm_loaddup_pd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[i*ldq]); - x1 = _mm_macc_pd(q1, h1, x1); - y1 = _mm_macc_pd(q1, h2, y1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - x2 = _mm_macc_pd(q2, h1, x2); - y2 = _mm_macc_pd(q2, h2, y2); -#else - q1 = _mm_load_pd(&q[i*ldq]); - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - x2 = 
_mm_add_pd(x2, _mm_mul_pd(q2,h1)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); -#endif - } - - h1 = _mm_loaddup_pd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[nb*ldq]); - x1 = _mm_macc_pd(q1, h1, x1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - x2 = _mm_macc_pd(q2, h1, x2); -#else - q1 = _mm_load_pd(&q[nb*ldq]); - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-2 update of Q [12 x nb+1] - ///////////////////////////////////////////////////// - - __m128d tau1 = _mm_loaddup_pd(hh); - __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); - __m128d vs = _mm_loaddup_pd(&s); - - h1 = _mm_xor_pd(tau1, sign); - x1 = _mm_mul_pd(x1, h1); - x2 = _mm_mul_pd(x2, h1); - h1 = _mm_xor_pd(tau2, sign); - h2 = _mm_mul_pd(h1, vs); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(y1, h1, _mm_mul_pd(x1,h2)); - y2 = _mm_macc_pd(y2, h1, _mm_mul_pd(x2,h2)); -#else - y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); - y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); -#endif - - q1 = _mm_load_pd(q); - q1 = _mm_add_pd(q1, y1); - _mm_store_pd(q,q1); - q2 = _mm_load_pd(&q[2]); - q2 = _mm_add_pd(q2, y2); - _mm_store_pd(&q[2],q2); - - h2 = _mm_loaddup_pd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[ldq]); - q1 = _mm_add_pd(q1, _mm_macc_pd(y1, h2, x1)); - _mm_store_pd(&q[ldq],q1); - q2 = _mm_load_pd(&q[ldq+2]); - q2 = _mm_add_pd(q2, _mm_macc_pd(y2, h2, x2)); - _mm_store_pd(&q[ldq+2],q2); -#else - q1 = _mm_load_pd(&q[ldq]); - q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[ldq],q1); - q2 = _mm_load_pd(&q[ldq+2]); - q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[ldq+2],q2); -#endif - - for (i = 2; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-1]); - h2 = _mm_loaddup_pd(&hh[ldh+i]); - -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[i*ldq]); - q1 = _mm_add_pd(q1, _mm_macc_pd(x1, h1, 
_mm_mul_pd(y1, h2))); - _mm_store_pd(&q[i*ldq],q1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_macc_pd(x2, h1, _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[(i*ldq)+2],q2); -#else - q1 = _mm_load_pd(&q[i*ldq]); - q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[i*ldq],q1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[(i*ldq)+2],q2); -#endif - } - - h1 = _mm_loaddup_pd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_load_pd(&q[nb*ldq]); - q1 = _mm_macc_pd(x1, h1, q1); - _mm_store_pd(&q[nb*ldq],q1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - q2 = _mm_macc_pd(x2, h1, q2); - _mm_store_pd(&q[(nb*ldq)+2],q2); -#else - q1 = _mm_load_pd(&q[nb*ldq]); - q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); - _mm_store_pd(&q[nb*ldq],q1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); - _mm_store_pd(&q[(nb*ldq)+2],q2); -#endif -} -#endif diff --git a/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c b/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c deleted file mode 100644 index 007cf94cc..000000000 --- a/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c +++ /dev/null @@ -1,2471 +0,0 @@ -// This file is part of ELPA. -// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -// Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// This particular source code file contains additions, changes and -// enhancements authored by Intel Corporation which is not part of -// the ELPA consortium. -// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. If not, see <http://www.gnu.org/licenses/> -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. -// -// -// -------------------------------------------------------------------------------------------------- -// -// This file contains the compute intensive kernels for the Householder transformations. -// It should be compiled with the highest possible optimization level. -// -// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 -// On Intel Sandy Bridge use -O3 -mavx -// -// Copyright of the original code rests with the authors inside the ELPA -// consortium. 
The copyright of any additional modifications shall rest -// with their original authors, but shall adhere to the licensing terms -// distributed along with the original code in the file "COPYING". -// -// Author: Alexander Heinecke (alexander.heinecke@mytum.de) -// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) -// -------------------------------------------------------------------------------------------------- - -#include <x86intrin.h> - -#define __forceinline __attribute__((always_inline)) static - -#ifdef __USE_AVX128__ -#undef __AVX__ -#endif - -#ifdef __FMA4__ -#define __ELPA_USE_FMA__ -#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) -#define _mm256_NFMA_pd(a,b,c) _mm256_nmacc_pd(a,b,c) -#define _mm256_FMSUB_pd(a,b,c) _mm256_msub(a,b,c) -#endif - -#ifdef __AVX2__ -#define __ELPA_USE_FMA__ -#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c) -#define _mm256_NFMA_pd(a,b,c) _mm256_fnmadd_pd(a,b,c) -#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c) -#endif - -//Forward declaration -#ifdef __AVX__ -__forceinline void hh_trafo_kernel_4_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); -__forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); -__forceinline void hh_trafo_kernel_12_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); -#else -__forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); -__forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); -__forceinline void 
hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); -#endif - -void quad_hh_trafo_real_sse_avx_4hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); -#if 0 -void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); -#endif - -void quad_hh_trafo_real_sse_avx_4hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - // calculating scalar products to compute - // 4 householder vectors simultaneously - double s_1_2 = hh[(ldh)+1]; - double s_1_3 = hh[(ldh*2)+2]; - double s_2_3 = hh[(ldh*2)+1]; - double s_1_4 = hh[(ldh*3)+3]; - double s_2_4 = hh[(ldh*3)+2]; - double s_3_4 = hh[(ldh*3)+1]; - - // calculate scalar product of first and fourth householder vector - // loop counter = 2 - s_1_2 += hh[2-1] * hh[(2+ldh)]; - s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)]; - s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)]; - - // loop counter = 3 - s_1_2 += hh[3-1] * hh[(3+ldh)]; - s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)]; - s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)]; - - s_1_3 += hh[3-2] * hh[3+(ldh*2)]; - s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)]; - - #pragma ivdep - for (i = 4; i < nb; i++) - { - s_1_2 += hh[i-1] * hh[(i+ldh)]; - s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)]; - s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; - - s_1_3 += hh[i-2] * hh[i+(ldh*2)]; - s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; - - s_1_4 += hh[i-3] * hh[i+(ldh*3)]; - } - -// printf("s_1_2: %f\n", s_1_2); -// printf("s_1_3: %f\n", s_1_3); -// printf("s_2_3: %f\n", s_2_3); -// printf("s_1_4: %f\n", s_1_4); -// printf("s_2_4: %f\n", s_2_4); -// printf("s_3_4: %f\n", s_3_4); - - // Production level kernel calls with padding -#ifdef __AVX__ - for (i = 0; i < nq-8; i+=12) - { - hh_trafo_kernel_12_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } - if (nq 
== i) - { - return; - } - else - { - if (nq-i > 4) - { - hh_trafo_kernel_8_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } - else - { - hh_trafo_kernel_4_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } - } -#else - for (i = 0; i < nq-4; i+=6) - { - hh_trafo_kernel_6_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } - if (nq == i) - { - return; - } - else - { - if (nq-i > 2) - { - hh_trafo_kernel_4_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } - else - { - hh_trafo_kernel_2_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } - } -#endif -} - -#if 0 -void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - // calculating scalar products to compute - // 4 householder vectors simultaneously - double s_1_2 = hh[(ldh)+1]; - double s_1_3 = hh[(ldh*2)+2]; - double s_2_3 = hh[(ldh*2)+1]; - double s_1_4 = hh[(ldh*3)+3]; - double s_2_4 = hh[(ldh*3)+2]; - double s_3_4 = hh[(ldh*3)+1]; - - // calculate scalar product of first and fourth householder vector - // loop counter = 2 - s_1_2 += hh[2-1] * hh[(2+ldh)]; - s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)]; - s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)]; - - // loop counter = 3 - s_1_2 += hh[3-1] * hh[(3+ldh)]; - s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)]; - s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)]; - - s_1_3 += hh[3-2] * hh[3+(ldh*2)]; - s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)]; - - #pragma ivdep - for (i = 4; i < nb; i++) - { - s_1_2 += hh[i-1] * hh[(i+ldh)]; - s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)]; - s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; - - s_1_3 += hh[i-2] * hh[i+(ldh*2)]; - s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; - - s_1_4 += hh[i-3] * hh[i+(ldh*3)]; - } - - // Production level kernel calls with padding -#ifdef __AVX__ - for (i = 0; i < nq; i+=12) - { - 
hh_trafo_kernel_12_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } -#else - for (i = 0; i < nq; i+=6) - { - hh_trafo_kernel_6_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } -#endif -} -#endif - -#ifdef __AVX__ -/** - * Unrolled kernel that computes - * 12 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_12_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [12 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m256d a1_1 = _mm256_load_pd(&q[ldq*3]); - __m256d a2_1 = _mm256_load_pd(&q[ldq*2]); - __m256d a3_1 = _mm256_load_pd(&q[ldq]); - __m256d a4_1 = _mm256_load_pd(&q[0]); - - __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); - __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); - __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); - -#ifdef __ELPA_USE_FMA__ - register __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); - w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); - w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); - register __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); - z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); - register __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); - register __m256d x1 = a1_1; -#else - register __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); - register __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, 
h_3_1)); - register __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); - register __m256d x1 = a1_1; -#endif - - __m256d a1_2 = _mm256_load_pd(&q[(ldq*3)+4]); - __m256d a2_2 = _mm256_load_pd(&q[(ldq*2)+4]); - __m256d a3_2 = _mm256_load_pd(&q[ldq+4]); - __m256d a4_2 = _mm256_load_pd(&q[0+4]); - -#ifdef __ELPA_USE_FMA__ - register __m256d w2 = _mm256_FMA_pd(a3_2, h_4_3, a4_2); - w2 = _mm256_FMA_pd(a2_2, h_4_2, w2); - w2 = _mm256_FMA_pd(a1_2, h_4_1, w2); - register __m256d z2 = _mm256_FMA_pd(a2_2, h_3_2, a3_2); - z2 = _mm256_FMA_pd(a1_2, h_3_1, z2); - register __m256d y2 = _mm256_FMA_pd(a1_2, h_2_1, a2_2); - register __m256d x2 = a1_2; -#else - register __m256d w2 = _mm256_add_pd(a4_2, _mm256_mul_pd(a3_2, h_4_3)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(a2_2, h_4_2)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(a1_2, h_4_1)); - register __m256d z2 = _mm256_add_pd(a3_2, _mm256_mul_pd(a2_2, h_3_2)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(a1_2, h_3_1)); - register __m256d y2 = _mm256_add_pd(a2_2, _mm256_mul_pd(a1_2, h_2_1)); - register __m256d x2 = a1_2; -#endif - - __m256d a1_3 = _mm256_load_pd(&q[(ldq*3)+8]); - __m256d a2_3 = _mm256_load_pd(&q[(ldq*2)+8]); - __m256d a3_3 = _mm256_load_pd(&q[ldq+8]); - __m256d a4_3 = _mm256_load_pd(&q[0+8]); - -#ifdef __ELPA_USE_FMA__ - register __m256d w3 = _mm256_FMA_pd(a3_3, h_4_3, a4_3); - w3 = _mm256_FMA_pd(a2_3, h_4_2, w3); - w3 = _mm256_FMA_pd(a1_3, h_4_1, w3); - register __m256d z3 = _mm256_FMA_pd(a2_3, h_3_2, a3_3); - z3 = _mm256_FMA_pd(a1_3, h_3_1, z3); - register __m256d y3 = _mm256_FMA_pd(a1_3, h_2_1, a2_3); - register __m256d x3 = a1_3; -#else - register __m256d w3 = _mm256_add_pd(a4_3, _mm256_mul_pd(a3_3, h_4_3)); - w3 = _mm256_add_pd(w3, _mm256_mul_pd(a2_3, h_4_2)); - w3 = _mm256_add_pd(w3, _mm256_mul_pd(a1_3, h_4_1)); - register __m256d z3 = _mm256_add_pd(a3_3, _mm256_mul_pd(a2_3, h_3_2)); - z3 = _mm256_add_pd(z3, _mm256_mul_pd(a1_3, h_3_1)); - register __m256d y3 = _mm256_add_pd(a2_3, _mm256_mul_pd(a1_3, h_2_1)); - 
register __m256d x3 = a1_3; -#endif - - __m256d q1; - __m256d q2; - __m256d q3; - - __m256d h1; - __m256d h2; - __m256d h3; - __m256d h4; - - for(i = 4; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-3]); - q1 = _mm256_load_pd(&q[i*ldq]); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); - x3 = _mm256_FMA_pd(q3, h1, x3); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); -#endif - - h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); - y3 = _mm256_FMA_pd(q3, h2, y3); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); - y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); -#endif - - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); - z2 = _mm256_FMA_pd(q2, h3, z2); - z3 = _mm256_FMA_pd(q3, h3, z3); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); - z3 = _mm256_add_pd(z3, _mm256_mul_pd(q3,h3)); -#endif - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_pd(q1, h4, w1); - w2 = _mm256_FMA_pd(q2, h4, w2); - w3 = _mm256_FMA_pd(q3, h4, w3); -#else - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); - w3 = _mm256_add_pd(w3, _mm256_mul_pd(q3,h4)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - - q1 = _mm256_load_pd(&q[nb*ldq]); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); - x3 = _mm256_FMA_pd(q3, h1, x3); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - x3 = 
_mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); -#endif - - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); -#ifdef __FMA4_ - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); - y3 = _mm256_FMA_pd(q3, h2, y3); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); - y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); -#endif - - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); - z2 = _mm256_FMA_pd(q2, h3, z2); - z3 = _mm256_FMA_pd(q3, h3, z3); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); - z3 = _mm256_add_pd(z3, _mm256_mul_pd(q3,h3)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); - q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); - q3 = _mm256_load_pd(&q[((nb+1)*ldq)+8]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); - x3 = _mm256_FMA_pd(q3, h1, x3); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); -#endif - - h2 = _mm256_broadcast_sd(&hh[(ldh*1)+nb-1]); - -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); - y3 = _mm256_FMA_pd(q3, h2, y3); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); - y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); - q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); - q3 = _mm256_load_pd(&q[((nb+2)*ldq)+8]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); - x3 = _mm256_FMA_pd(q3, h1, x3); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); -#endif - - 
///////////////////////////////////////////////////// - // Rank-1 update of Q [12 x nb+3] - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(&hh[0]); - - h1 = tau1; - x1 = _mm256_mul_pd(x1, h1); - x2 = _mm256_mul_pd(x2, h1); - x3 = _mm256_mul_pd(x3, h1); - - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d vs_1_2 = _mm256_broadcast_sd(&s_1_2); - - h1 = tau2; - h2 = _mm256_mul_pd(h1, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMSUB_pd(y1, h1, _mm256_mul_pd(x1,h2)); - y2 = _mm256_FMSUB_pd(y2, h1, _mm256_mul_pd(x2,h2)); - y3 = _mm256_FMSUB_pd(y3, h1, _mm256_mul_pd(x3,h2)); -#else - y1 = _mm256_sub_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); - y2 = _mm256_sub_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2)); - y3 = _mm256_sub_pd(_mm256_mul_pd(y3,h1), _mm256_mul_pd(x3,h2)); -#endif - - __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); - __m256d vs_1_3 = _mm256_broadcast_sd(&s_1_3); - __m256d vs_2_3 = _mm256_broadcast_sd(&s_2_3); - - h1 = tau3; - h2 = _mm256_mul_pd(h1, vs_1_3); - h3 = _mm256_mul_pd(h1, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMSUB_pd(z1, h1, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); - z2 = _mm256_FMSUB_pd(z2, h1, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))); - z3 = _mm256_FMSUB_pd(z3, h1, _mm256_FMA_pd(y3, h3, _mm256_mul_pd(x3,h2))); -#else - z1 = _mm256_sub_pd(_mm256_mul_pd(z1,h1), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); - z2 = _mm256_sub_pd(_mm256_mul_pd(z2,h1), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))); - z3 = _mm256_sub_pd(_mm256_mul_pd(z3,h1), _mm256_add_pd(_mm256_mul_pd(y3,h3), _mm256_mul_pd(x3,h2))); -#endif - - __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); - __m256d vs_1_4 = _mm256_broadcast_sd(&s_1_4); - __m256d vs_2_4 = _mm256_broadcast_sd(&s_2_4); - __m256d vs_3_4 = _mm256_broadcast_sd(&s_3_4); - - h1 = tau4; - h2 = _mm256_mul_pd(h1, vs_1_4); - h3 = _mm256_mul_pd(h1, vs_2_4); - h4 = _mm256_mul_pd(h1, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 
= _mm256_FMSUB_pd(w1, h1, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); - w2 = _mm256_FMSUB_pd(w2, h1, _mm256_FMA_pd(z2, h4, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); - w3 = _mm256_FMSUB_pd(w3, h1, _mm256_FMA_pd(z3, h4, _mm256_FMA_pd(y3, h3, _mm256_mul_pd(x3,h2)))); -#else - w1 = _mm256_sub_pd(_mm256_mul_pd(w1,h1), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); - w2 = _mm256_sub_pd(_mm256_mul_pd(w2,h1), _mm256_add_pd(_mm256_mul_pd(z2,h4), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); - w3 = _mm256_sub_pd(_mm256_mul_pd(w3,h1), _mm256_add_pd(_mm256_mul_pd(z3,h4), _mm256_add_pd(_mm256_mul_pd(y3,h3), _mm256_mul_pd(x3,h2)))); -#endif - - q1 = _mm256_load_pd(&q[0]); - q2 = _mm256_load_pd(&q[4]); - q3 = _mm256_load_pd(&q[8]); - q1 = _mm256_sub_pd(q1, w1); - q2 = _mm256_sub_pd(q2, w2); - q3 = _mm256_sub_pd(q3, w3); - _mm256_store_pd(&q[0],q1); - _mm256_store_pd(&q[4],q2); - _mm256_store_pd(&q[8],q3); - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - q1 = _mm256_load_pd(&q[ldq]); - q2 = _mm256_load_pd(&q[ldq+4]); - q3 = _mm256_load_pd(&q[ldq+8]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_sub_pd(q1, _mm256_FMA_pd(w1, h4, z1)); - q2 = _mm256_sub_pd(q2, _mm256_FMA_pd(w2, h4, z2)); - q3 = _mm256_sub_pd(q3, _mm256_FMA_pd(w3, h4, z3)); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(z1, _mm256_mul_pd(w1, h4))); - q2 = _mm256_sub_pd(q2, _mm256_add_pd(z2, _mm256_mul_pd(w2, h4))); - q3 = _mm256_sub_pd(q3, _mm256_add_pd(z3, _mm256_mul_pd(w3, h4))); -#endif - _mm256_store_pd(&q[ldq],q1); - _mm256_store_pd(&q[ldq+4],q2); - _mm256_store_pd(&q[ldq+8],q3); - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); - q1 = _mm256_load_pd(&q[ldq*2]); - q2 = _mm256_load_pd(&q[(ldq*2)+4]); - q3 = _mm256_load_pd(&q[(ldq*2)+8]); - q1 = _mm256_sub_pd(q1, y1); - q2 = _mm256_sub_pd(q2, y2); - q3 = _mm256_sub_pd(q3, y3); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); - q3 = 
_mm256_NFMA_pd(w3, h4, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(w3, h4)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); - q3 = _mm256_NFMA_pd(z3, h3, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(z3, h3)); -#endif - _mm256_store_pd(&q[ldq*2],q1); - _mm256_store_pd(&q[(ldq*2)+4],q2); - _mm256_store_pd(&q[(ldq*2)+8],q3); - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); - q1 = _mm256_load_pd(&q[ldq*3]); - q2 = _mm256_load_pd(&q[(ldq*3)+4]); - q3 = _mm256_load_pd(&q[(ldq*3)+8]); - q1 = _mm256_sub_pd(q1, x1); - q2 = _mm256_sub_pd(q2, x2); - q3 = _mm256_sub_pd(q3, x3); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); - q3 = _mm256_NFMA_pd(w3, h4, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(w3, h4)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); - q3 = _mm256_NFMA_pd(y3, h2, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(y3, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); - q3 = _mm256_NFMA_pd(z3, h3, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(z3, h3)); -#endif - _mm256_store_pd(&q[ldq*3], q1); - _mm256_store_pd(&q[(ldq*3)+4], q2); - _mm256_store_pd(&q[(ldq*3)+8], q3); - - for (i = 4; i < nb; i++) - { - h1 = 
_mm256_broadcast_sd(&hh[i-3]); - - q1 = _mm256_load_pd(&q[i*ldq]); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); - q3 = _mm256_NFMA_pd(x3, h1, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1,h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2,h1)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(x3,h1)); -#endif - - h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); - q3 = _mm256_NFMA_pd(y3, h2, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1,h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2,h2)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(y3,h2)); -#endif - - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); - q3 = _mm256_NFMA_pd(z3, h3, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1,h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2,h3)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(z3,h3)); -#endif - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); - q3 = _mm256_NFMA_pd(w3, h4, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1,h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2,h4)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(w3,h4)); -#endif - - _mm256_store_pd(&q[i*ldq],q1); - _mm256_store_pd(&q[(i*ldq)+4],q2); - _mm256_store_pd(&q[(i*ldq)+8],q3); - } - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - q1 = _mm256_load_pd(&q[nb*ldq]); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); - q3 = _mm256_NFMA_pd(x3, h1, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1,h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2,h1)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(x3,h1)); -#endif - h2 
= _mm256_broadcast_sd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); - q3 = _mm256_NFMA_pd(y3, h2, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1,h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2,h2)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(y3,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); - q3 = _mm256_NFMA_pd(z3, h3, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1,h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2,h3)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(z3,h3)); -#endif - _mm256_store_pd(&q[nb*ldq],q1); - _mm256_store_pd(&q[(nb*ldq)+4],q2); - _mm256_store_pd(&q[(nb*ldq)+8],q3); - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); - q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); - q3 = _mm256_load_pd(&q[((nb+1)*ldq)+8]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); - q3 = _mm256_NFMA_pd(x3, h1, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1,h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2,h1)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(x3,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); - q3 = _mm256_NFMA_pd(y3, h2, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1,h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2,h2)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(y3,h2)); -#endif - _mm256_store_pd(&q[(nb+1)*ldq],q1); - _mm256_store_pd(&q[((nb+1)*ldq)+4],q2); - _mm256_store_pd(&q[((nb+1)*ldq)+8],q3); - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); - q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); - q3 = _mm256_load_pd(&q[((nb+2)*ldq)+8]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); - q3 = _mm256_NFMA_pd(x3, h1, q3); -#else - q1 = 
_mm256_sub_pd(q1, _mm256_mul_pd(x1,h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2,h1)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(x3,h1)); -#endif - _mm256_store_pd(&q[(nb+2)*ldq],q1); - _mm256_store_pd(&q[((nb+2)*ldq)+4],q2); - _mm256_store_pd(&q[((nb+2)*ldq)+8],q3); -} - -/** - * Unrolled kernel that computes - * 8 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [4 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m256d a1_1 = _mm256_load_pd(&q[ldq*3]); - __m256d a2_1 = _mm256_load_pd(&q[ldq*2]); - __m256d a3_1 = _mm256_load_pd(&q[ldq]); - __m256d a4_1 = _mm256_load_pd(&q[0]); - - __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); - __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); - __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); - -#ifdef __ELPA_USE_FMA__ - __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); - w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); - w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); - __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); - z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); - __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); - __m256d x1 = a1_1; -#else - __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); - __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); - __m256d y1 = _mm256_add_pd(a2_1, 
_mm256_mul_pd(a1_1, h_2_1)); - __m256d x1 = a1_1; -#endif - - __m256d a1_2 = _mm256_load_pd(&q[(ldq*3)+4]); - __m256d a2_2 = _mm256_load_pd(&q[(ldq*2)+4]); - __m256d a3_2 = _mm256_load_pd(&q[ldq+4]); - __m256d a4_2 = _mm256_load_pd(&q[0+4]); - -#ifdef __ELPA_USE_FMA__ - __m256d w2 = _mm256_FMA_pd(a3_2, h_4_3, a4_2); - w2 = _mm256_FMA_pd(a2_2, h_4_2, w2); - w2 = _mm256_FMA_pd(a1_2, h_4_1, w2); - __m256d z2 = _mm256_FMA_pd(a2_2, h_3_2, a3_2); - z2 = _mm256_FMA_pd(a1_2, h_3_1, z2); - __m256d y2 = _mm256_FMA_pd(a1_2, h_2_1, a2_2); - __m256d x2 = a1_2; -#else - __m256d w2 = _mm256_add_pd(a4_2, _mm256_mul_pd(a3_2, h_4_3)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(a2_2, h_4_2)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(a1_2, h_4_1)); - __m256d z2 = _mm256_add_pd(a3_2, _mm256_mul_pd(a2_2, h_3_2)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(a1_2, h_3_1)); - __m256d y2 = _mm256_add_pd(a2_2, _mm256_mul_pd(a1_2, h_2_1)); - __m256d x2 = a1_2; -#endif - - __m256d q1; - __m256d q2; - - __m256d h1; - __m256d h2; - __m256d h3; - __m256d h4; - - for(i = 4; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-3]); - h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); - - q1 = _mm256_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - y1 = _mm256_FMA_pd(q1, h2, y1); - z1 = _mm256_FMA_pd(q1, h3, z1); - w1 = _mm256_FMA_pd(q1, h4, w1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); -#endif - - q2 = _mm256_load_pd(&q[(i*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_FMA_pd(q2, h1, x2); - y2 = _mm256_FMA_pd(q2, h2, y2); - z2 = _mm256_FMA_pd(q2, h3, z2); - w2 = _mm256_FMA_pd(q2, h4, w2); -#else - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); - w2 = 
_mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); - - q1 = _mm256_load_pd(&q[nb*ldq]); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); - z1 = _mm256_FMA_pd(q1, h3, z1); - z2 = _mm256_FMA_pd(q2, h3, z2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - h2 = _mm256_broadcast_sd(&hh[(ldh*1)+nb-1]); - - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); - q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); - q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-1 update of Q [8 x nb+3] - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(&hh[0]); - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); - __m256d tau4 = 
_mm256_broadcast_sd(&hh[ldh*3]); - - __m256d vs_1_2 = _mm256_broadcast_sd(&s_1_2); - __m256d vs_1_3 = _mm256_broadcast_sd(&s_1_3); - __m256d vs_2_3 = _mm256_broadcast_sd(&s_2_3); - __m256d vs_1_4 = _mm256_broadcast_sd(&s_1_4); - __m256d vs_2_4 = _mm256_broadcast_sd(&s_2_4); - __m256d vs_3_4 = _mm256_broadcast_sd(&s_3_4); - - h1 = tau1; - x1 = _mm256_mul_pd(x1, h1); - x2 = _mm256_mul_pd(x2, h1); - - h1 = tau2; - h2 = _mm256_mul_pd(h1, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMSUB_pd(y1, h1, _mm256_mul_pd(x1,h2)); - y2 = _mm256_FMSUB_pd(y2, h1, _mm256_mul_pd(x2,h2)); -#else - y1 = _mm256_sub_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); - y2 = _mm256_sub_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2)); -#endif - - h1 = tau3; - h2 = _mm256_mul_pd(h1, vs_1_3); - h3 = _mm256_mul_pd(h1, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMSUB_pd(z1, h1, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); - z2 = _mm256_FMSUB_pd(z2, h1, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))); -#else - z1 = _mm256_sub_pd(_mm256_mul_pd(z1,h1), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); - z2 = _mm256_sub_pd(_mm256_mul_pd(z2,h1), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))); -#endif - - h1 = tau4; - h2 = _mm256_mul_pd(h1, vs_1_4); - h3 = _mm256_mul_pd(h1, vs_2_4); - h4 = _mm256_mul_pd(h1, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMSUB_pd(w1, h1, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); - w2 = _mm256_FMSUB_pd(w2, h1, _mm256_FMA_pd(z2, h4, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); -#else - w1 = _mm256_sub_pd(_mm256_mul_pd(w1,h1), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); - w2 = _mm256_sub_pd(_mm256_mul_pd(w2,h1), _mm256_add_pd(_mm256_mul_pd(z2,h4), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); -#endif - - q1 = _mm256_load_pd(&q[0]); - q2 = _mm256_load_pd(&q[4]); - q1 = _mm256_sub_pd(q1, w1); - q2 = _mm256_sub_pd(q2, w2); - _mm256_store_pd(&q[0],q1); - 
_mm256_store_pd(&q[4],q2); - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - q1 = _mm256_load_pd(&q[ldq]); - q2 = _mm256_load_pd(&q[ldq+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_sub_pd(q1, _mm256_FMA_pd(w1, h4, z1)); - q2 = _mm256_sub_pd(q2, _mm256_FMA_pd(w2, h4, z2)); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(z1, _mm256_mul_pd(w1, h4))); - q2 = _mm256_sub_pd(q2, _mm256_add_pd(z2, _mm256_mul_pd(w2, h4))); -#endif - _mm256_store_pd(&q[ldq],q1); - _mm256_store_pd(&q[ldq+4],q2); - - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); - q1 = _mm256_load_pd(&q[ldq*2]); - q2 = _mm256_load_pd(&q[(ldq*2)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_sub_pd(q1, y1); - q1 = _mm256_NFMA_pd(z1, h3, q1); - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_sub_pd(q2, y2); - q2 = _mm256_NFMA_pd(z2, h3, q2); - q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(y1, _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4)))); - q2 = _mm256_sub_pd(q2, _mm256_add_pd(y2, _mm256_add_pd(_mm256_mul_pd(z2, h3), _mm256_mul_pd(w2, h4)))); -#endif - _mm256_store_pd(&q[ldq*2],q1); - _mm256_store_pd(&q[(ldq*2)+4],q2); - - h2 = _mm256_broadcast_sd(&hh[ldh+1]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); - q1 = _mm256_load_pd(&q[ldq*3]); - q2 = _mm256_load_pd(&q[(ldq*3)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_sub_pd(q1, x1); - q1 = _mm256_NFMA_pd(y1, h2, q1); - q1 = _mm256_NFMA_pd(z1, h3, q1); - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_sub_pd(q2, x2); - q2 = _mm256_NFMA_pd(y2, h2, q2); - q2 = _mm256_NFMA_pd(z2, h3, q2); - q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(x1, _mm256_add_pd(_mm256_mul_pd(y1, h2), _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4))))); - q2 = _mm256_sub_pd(q2, _mm256_add_pd(x2, _mm256_add_pd(_mm256_mul_pd(y2, h2), _mm256_add_pd(_mm256_mul_pd(z2, h3), _mm256_mul_pd(w2, h4))))); -#endif - _mm256_store_pd(&q[ldq*3], 
q1); - _mm256_store_pd(&q[(ldq*3)+4], q2); - - for (i = 4; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-3]); - h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); - -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q1 = _mm256_NFMA_pd(x1, h1, q1); - q1 = _mm256_NFMA_pd(y1, h2, q1); - q1 = _mm256_NFMA_pd(z1, h3, q1); - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); - q2 = _mm256_NFMA_pd(y2, h2, q2); - q2 = _mm256_NFMA_pd(z2, h3, q2); - q2 = _mm256_NFMA_pd(w2, h4, q2); - _mm256_store_pd(&q[i*ldq],q1); - _mm256_store_pd(&q[(i*ldq)+4],q2); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1, h4), _mm256_mul_pd(z1, h3)), _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2)))); - _mm256_store_pd(&q[i*ldq],q1); - - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q2 = _mm256_sub_pd(q2, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w2, h4), _mm256_mul_pd(z2, h3)), _mm256_add_pd(_mm256_mul_pd(x2,h1), _mm256_mul_pd(y2, h2)))); - _mm256_store_pd(&q[(i*ldq)+4],q2); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); - q1 = _mm256_load_pd(&q[nb*ldq]); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q1 = _mm256_NFMA_pd(y1, h2, q1); - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); - q2 = _mm256_NFMA_pd(y2, h2, q2); - q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(y1, h2)) , _mm256_mul_pd(x1, h1))); - q2 = _mm256_sub_pd(q2, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(z2, h3), _mm256_mul_pd(y2, h2)) , _mm256_mul_pd(x2, h1))); -#endif - _mm256_store_pd(&q[nb*ldq],q1); - _mm256_store_pd(&q[(nb*ldq)+4],q2); - - h1 = 
_mm256_broadcast_sd(&hh[nb-2]); - h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); - q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); - q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd( _mm256_mul_pd(y1, h2) , _mm256_mul_pd(x1, h1))); - q2 = _mm256_sub_pd(q2, _mm256_add_pd( _mm256_mul_pd(y2, h2) , _mm256_mul_pd(x2, h1))); -#endif - _mm256_store_pd(&q[(nb+1)*ldq],q1); - _mm256_store_pd(&q[((nb+1)*ldq)+4],q2); - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); - q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - _mm256_store_pd(&q[(nb+2)*ldq],q1); - _mm256_store_pd(&q[((nb+2)*ldq)+4],q2); -} - -/** - * Unrolled kernel that computes - * 4 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_4_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [4 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m256d a1_1 = _mm256_load_pd(&q[ldq*3]); - __m256d a2_1 = _mm256_load_pd(&q[ldq*2]); - __m256d a3_1 = _mm256_load_pd(&q[ldq]); - __m256d a4_1 = _mm256_load_pd(&q[0]); - - __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); - __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - __m256d h_4_2 = 
_mm256_broadcast_sd(&hh[(ldh*3)+2]); - __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); - -#ifdef __ELPA_USE_FMA__ - __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); - w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); - w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); - __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); - z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); - __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); - __m256d x1 = a1_1; -#else - __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); - __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); - __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); - __m256d x1 = a1_1; -#endif - - __m256d q1; - - __m256d h1; - __m256d h2; - __m256d h3; - __m256d h4; - - for(i = 4; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-3]); - h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); - - q1 = _mm256_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - y1 = _mm256_FMA_pd(q1, h2, y1); - z1 = _mm256_FMA_pd(q1, h3, z1); - w1 = _mm256_FMA_pd(q1, h4, w1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); - q1 = _mm256_load_pd(&q[nb*ldq]); -#ifdef _FMA4__ - x1 = _mm256_FMA_pd(q1, h1, x1); - y1 = _mm256_FMA_pd(q1, h2, y1); - z1 = _mm256_FMA_pd(q1, h3, z1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - h2 = 
_mm256_broadcast_sd(&hh[(ldh*1)+nb-1]); - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - y1 = _mm256_FMA_pd(q1, h2, y1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-1 update of Q [4 x nb+3] - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(&hh[0]); - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); - __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); - - __m256d vs_1_2 = _mm256_broadcast_sd(&s_1_2); - __m256d vs_1_3 = _mm256_broadcast_sd(&s_1_3); - __m256d vs_2_3 = _mm256_broadcast_sd(&s_2_3); - __m256d vs_1_4 = _mm256_broadcast_sd(&s_1_4); - __m256d vs_2_4 = _mm256_broadcast_sd(&s_2_4); - __m256d vs_3_4 = _mm256_broadcast_sd(&s_3_4); - - h1 = tau1; - x1 = _mm256_mul_pd(x1, h1); - - h1 = tau2; - h2 = _mm256_mul_pd(h1, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMSUB_pd(y1, h1, _mm256_mul_pd(x1,h2)); -#else - y1 = _mm256_sub_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); -#endif - - h1 = tau3; - h2 = _mm256_mul_pd(h1, vs_1_3); - h3 = _mm256_mul_pd(h1, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMSUB_pd(z1, h1, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); -#else - z1 = _mm256_sub_pd(_mm256_mul_pd(z1,h1), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); -#endif - - h1 = tau4; - h2 = _mm256_mul_pd(h1, vs_1_4); - h3 = _mm256_mul_pd(h1, vs_2_4); - h4 = _mm256_mul_pd(h1, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMSUB_pd(w1, h1, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); -#else - w1 = _mm256_sub_pd(_mm256_mul_pd(w1,h1), 
_mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); -#endif - - q1 = _mm256_load_pd(&q[0]); - q1 = _mm256_sub_pd(q1, w1); - _mm256_store_pd(&q[0],q1); - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - q1 = _mm256_load_pd(&q[ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_sub_pd(q1, _mm256_FMA_pd(w1, h4, z1)); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(z1, _mm256_mul_pd(w1, h4))); -#endif - _mm256_store_pd(&q[ldq],q1); - - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); - q1 = _mm256_load_pd(&q[ldq*2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_sub_pd(q1, y1); - q1 = _mm256_NFMA_pd(z1, h3, q1); - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(y1, _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4)))); -#endif - _mm256_store_pd(&q[ldq*2],q1); - - h2 = _mm256_broadcast_sd(&hh[ldh+1]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); - q1 = _mm256_load_pd(&q[ldq*3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_sub_pd(q1, x1); - q1 = _mm256_NFMA_pd(y1, h2, q1); - q1 = _mm256_NFMA_pd(z1, h3, q1); - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(x1, _mm256_add_pd(_mm256_mul_pd(y1, h2), _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4))))); -#endif - _mm256_store_pd(&q[ldq*3], q1); - - for (i = 4; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-3]); - h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); - - q1 = _mm256_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q1 = _mm256_NFMA_pd(y1, h2, q1); - q1 = _mm256_NFMA_pd(z1, h3, q1); - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1, h4), _mm256_mul_pd(z1, h3)), _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2)))); -#endif - _mm256_store_pd(&q[i*ldq],q1); 
- } - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); - q1 = _mm256_load_pd(&q[nb*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q1 = _mm256_NFMA_pd(y1, h2, q1); - q1 = _mm256_NFMA_pd(z1, h3, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(y1, h2)) , _mm256_mul_pd(x1, h1))); -#endif - _mm256_store_pd(&q[nb*ldq],q1); - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q1 = _mm256_NFMA_pd(y1, h2, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd( _mm256_mul_pd(y1, h2) , _mm256_mul_pd(x1, h1))); -#endif - _mm256_store_pd(&q[(nb+1)*ldq],q1); - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); -#endif - _mm256_store_pd(&q[(nb+2)*ldq],q1); -} -#else -/** - * Unrolled kernel that computes - * 6 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [6 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m128d a1_1 = _mm_load_pd(&q[ldq*3]); - __m128d a2_1 = _mm_load_pd(&q[ldq*2]); - __m128d a3_1 = _mm_load_pd(&q[ldq]); - __m128d a4_1 = _mm_load_pd(&q[0]); - - __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); - __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); - __m128d h_4_3 = 
_mm_loaddup_pd(&hh[(ldh*3)+1]); - __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); - __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); - -#ifdef __ELPA_USE_FMA__ - register __m128d w1 = _mm_macc_pd(a3_1, h_4_3, a4_1); - w1 = _mm_macc_pd(a2_1, h_4_2, w1); - w1 = _mm_macc_pd(a1_1, h_4_1, w1); - register __m128d z1 = _mm_macc_pd(a2_1, h_3_2, a3_1); - z1 = _mm_macc_pd(a1_1, h_3_1, z1); - register __m128d y1 = _mm_macc_pd(a1_1, h_2_1, a2_1); - register __m128d x1 = a1_1; -#else - register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); - register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); - register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); - register __m128d x1 = a1_1; -#endif - - __m128d a1_2 = _mm_load_pd(&q[(ldq*3)+2]); - __m128d a2_2 = _mm_load_pd(&q[(ldq*2)+2]); - __m128d a3_2 = _mm_load_pd(&q[ldq+2]); - __m128d a4_2 = _mm_load_pd(&q[0+2]); - -#ifdef __ELPA_USE_FMA__ - register __m128d w2 = _mm_macc_pd(a3_2, h_4_3, a4_2); - w2 = _mm_macc_pd(a2_2, h_4_2, w2); - w2 = _mm_macc_pd(a1_2, h_4_1, w2); - register __m128d z2 = _mm_macc_pd(a2_2, h_3_2, a3_2); - z2 = _mm_macc_pd(a1_2, h_3_1, z2); - register __m128d y2 = _mm_macc_pd(a1_2, h_2_1, a2_2); - register __m128d x2 = a1_2; -#else - register __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); - w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); - w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1)); - register __m128d z2 = _mm_add_pd(a3_2, _mm_mul_pd(a2_2, h_3_2)); - z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); - register __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); - register __m128d x2 = a1_2; -#endif - - __m128d a1_3 = _mm_load_pd(&q[(ldq*3)+4]); - __m128d a2_3 = _mm_load_pd(&q[(ldq*2)+4]); - __m128d a3_3 = _mm_load_pd(&q[ldq+4]); - __m128d a4_3 = _mm_load_pd(&q[0+4]); - -#ifdef __ELPA_USE_FMA__ - register __m128d w3 = _mm_macc_pd(a3_3, 
h_4_3, a4_3); - w3 = _mm_macc_pd(a2_3, h_4_2, w3); - w3 = _mm_macc_pd(a1_3, h_4_1, w3); - register __m128d z3 = _mm_macc_pd(a2_3, h_3_2, a3_3); - z3 = _mm_macc_pd(a1_3, h_3_1, z3); - register __m128d y3 = _mm_macc_pd(a1_3, h_2_1, a2_3); - register __m128d x3 = a1_3; -#else - register __m128d w3 = _mm_add_pd(a4_3, _mm_mul_pd(a3_3, h_4_3)); - w3 = _mm_add_pd(w3, _mm_mul_pd(a2_3, h_4_2)); - w3 = _mm_add_pd(w3, _mm_mul_pd(a1_3, h_4_1)); - register __m128d z3 = _mm_add_pd(a3_3, _mm_mul_pd(a2_3, h_3_2)); - z3 = _mm_add_pd(z3, _mm_mul_pd(a1_3, h_3_1)); - register __m128d y3 = _mm_add_pd(a2_3, _mm_mul_pd(a1_3, h_2_1)); - register __m128d x3 = a1_3; -#endif - - __m128d q1; - __m128d q2; - __m128d q3; - - __m128d h1; - __m128d h2; - __m128d h3; - __m128d h4; - - for(i = 4; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-3]); - q1 = _mm_load_pd(&q[i*ldq]); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); - x3 = _mm_macc_pd(q3, h1, x3); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); -#endif - - h2 = _mm_loaddup_pd(&hh[ldh+i-2]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); - y3 = _mm_macc_pd(q3, h2, y3); -#else - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); -#endif - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); - z2 = _mm_macc_pd(q2, h3, z2); - z3 = _mm_macc_pd(q3, h3, z3); -#else - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); - z3 = _mm_add_pd(z3, _mm_mul_pd(q3,h3)); -#endif - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_macc_pd(q1, h4, w1); - w2 = _mm_macc_pd(q2, h4, w2); - w3 = _mm_macc_pd(q3, h4, w3); -#else - w1 = _mm_add_pd(w1, 
_mm_mul_pd(q1,h4)); - w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); - w3 = _mm_add_pd(w3, _mm_mul_pd(q3,h4)); -#endif - } - - h1 = _mm_loaddup_pd(&hh[nb-3]); - - q1 = _mm_load_pd(&q[nb*ldq]); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); - x3 = _mm_macc_pd(q3, h1, x3); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); -#endif - - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); - y3 = _mm_macc_pd(q3, h2, y3); -#else - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); -#endif - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); - z2 = _mm_macc_pd(q2, h3, z2); - z3 = _mm_macc_pd(q3, h3, z3); -#else - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); - z3 = _mm_add_pd(z3, _mm_mul_pd(q3,h3)); -#endif - - h1 = _mm_loaddup_pd(&hh[nb-2]); - - q1 = _mm_load_pd(&q[(nb+1)*ldq]); - q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); - q3 = _mm_load_pd(&q[((nb+1)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); - x3 = _mm_macc_pd(q3, h1, x3); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); -#endif - - h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); - y3 = _mm_macc_pd(q3, h2, y3); -#else - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); -#endif - - h1 = _mm_loaddup_pd(&hh[nb-1]); - - q1 = _mm_load_pd(&q[(nb+2)*ldq]); - q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); - q3 = 
_mm_load_pd(&q[((nb+2)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); - x3 = _mm_macc_pd(q3, h1, x3); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-1 update of Q [6 x nb+3] - ///////////////////////////////////////////////////// - - __m128d tau1 = _mm_loaddup_pd(&hh[0]); - - h1 = tau1; - x1 = _mm_mul_pd(x1, h1); - x2 = _mm_mul_pd(x2, h1); - x3 = _mm_mul_pd(x3, h1); - - __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); - __m128d vs_1_2 = _mm_loaddup_pd(&s_1_2); - - h1 = tau2; - h2 = _mm_mul_pd(h1, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_msub_pd(y1, h1, _mm_mul_pd(x1,h2)); - y2 = _mm_msub_pd(y2, h1, _mm_mul_pd(x2,h2)); - y3 = _mm_msub_pd(y3, h1, _mm_mul_pd(x3,h2)); -#else - y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); - y2 = _mm_sub_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); - y3 = _mm_sub_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); -#endif - - __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); - __m128d vs_1_3 = _mm_loaddup_pd(&s_1_3); - __m128d vs_2_3 = _mm_loaddup_pd(&s_2_3); - - h1 = tau3; - h2 = _mm_mul_pd(h1, vs_1_3); - h3 = _mm_mul_pd(h1, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_msub_pd(z1, h1, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))); - z2 = _mm_msub_pd(z2, h1, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2))); - z3 = _mm_msub_pd(z3, h1, _mm_macc_pd(y3, h3, _mm_mul_pd(x3,h2))); -#else - z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); - z2 = _mm_sub_pd(_mm_mul_pd(z2,h1), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); - z3 = _mm_sub_pd(_mm_mul_pd(z3,h1), _mm_add_pd(_mm_mul_pd(y3,h3), _mm_mul_pd(x3,h2))); -#endif - - __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); - __m128d vs_1_4 = _mm_loaddup_pd(&s_1_4); - __m128d vs_2_4 = _mm_loaddup_pd(&s_2_4); - __m128d vs_3_4 = _mm_loaddup_pd(&s_3_4); - - h1 = tau4; - h2 = _mm_mul_pd(h1, 
vs_1_4); - h3 = _mm_mul_pd(h1, vs_2_4); - h4 = _mm_mul_pd(h1, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_msub_pd(w1, h1, _mm_macc_pd(z1, h4, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); - w2 = _mm_msub_pd(w2, h1, _mm_macc_pd(z2, h4, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2)))); - w3 = _mm_msub_pd(w3, h1, _mm_macc_pd(z3, h4, _mm_macc_pd(y3, h3, _mm_mul_pd(x3,h2)))); -#else - w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); - w2 = _mm_sub_pd(_mm_mul_pd(w2,h1), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); - w3 = _mm_sub_pd(_mm_mul_pd(w3,h1), _mm_add_pd(_mm_mul_pd(z3,h4), _mm_add_pd(_mm_mul_pd(y3,h3), _mm_mul_pd(x3,h2)))); -#endif - - q1 = _mm_load_pd(&q[0]); - q2 = _mm_load_pd(&q[2]); - q3 = _mm_load_pd(&q[4]); - q1 = _mm_sub_pd(q1, w1); - q2 = _mm_sub_pd(q2, w2); - q3 = _mm_sub_pd(q3, w3); - _mm_store_pd(&q[0],q1); - _mm_store_pd(&q[2],q2); - _mm_store_pd(&q[4],q3); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - q1 = _mm_load_pd(&q[ldq]); - q2 = _mm_load_pd(&q[ldq+2]); - q3 = _mm_load_pd(&q[ldq+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_macc_pd(w1, h4, z1)); - q2 = _mm_sub_pd(q2, _mm_macc_pd(w2, h4, z2)); - q3 = _mm_sub_pd(q3, _mm_macc_pd(w3, h4, z3)); -#else - q1 = _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, h4))); - q2 = _mm_sub_pd(q2, _mm_add_pd(z2, _mm_mul_pd(w2, h4))); - q3 = _mm_sub_pd(q3, _mm_add_pd(z3, _mm_mul_pd(w3, h4))); -#endif - _mm_store_pd(&q[ldq],q1); - _mm_store_pd(&q[ldq+2],q2); - _mm_store_pd(&q[ldq+4],q3); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); - q1 = _mm_load_pd(&q[ldq*2]); - q2 = _mm_load_pd(&q[(ldq*2)+2]); - q3 = _mm_load_pd(&q[(ldq*2)+4]); - q1 = _mm_sub_pd(q1, y1); - q2 = _mm_sub_pd(q2, y2); - q3 = _mm_sub_pd(q3, y3); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); - q3 = _mm_nmacc_pd(w3, h4, q3); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); - 
q3 = _mm_sub_pd(q3, _mm_mul_pd(w3, h4)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); - q3 = _mm_nmacc_pd(z3, h3, q3); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); -#endif - _mm_store_pd(&q[ldq*2],q1); - _mm_store_pd(&q[(ldq*2)+2],q2); - _mm_store_pd(&q[(ldq*2)+4],q3); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); - q1 = _mm_load_pd(&q[ldq*3]); - q2 = _mm_load_pd(&q[(ldq*3)+2]); - q3 = _mm_load_pd(&q[(ldq*3)+4]); - q1 = _mm_sub_pd(q1, x1); - q2 = _mm_sub_pd(q2, x2); - q3 = _mm_sub_pd(q3, x3); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); - q3 = _mm_nmacc_pd(w3, h4, q3); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(w3, h4)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); - q3 = _mm_nmacc_pd(y3, h2, q3); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); - q3 = _mm_nmacc_pd(z3, h3, q3); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); -#endif - _mm_store_pd(&q[ldq*3], q1); - _mm_store_pd(&q[(ldq*3)+2], q2); - _mm_store_pd(&q[(ldq*3)+4], q3); - - for (i = 4; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-3]); - - q1 = _mm_load_pd(&q[i*ldq]); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); - q3 = _mm_nmacc_pd(x3, h1, q3); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1,h1)); 
- q2 = _mm_sub_pd(q2, _mm_mul_pd(x2,h1)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(x3,h1)); -#endif - - h2 = _mm_loaddup_pd(&hh[ldh+i-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); - q3 = _mm_nmacc_pd(y3, h2, q3); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1,h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2,h2)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(y3,h2)); -#endif - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); - q3 = _mm_nmacc_pd(z3, h3, q3); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1,h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2,h3)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(z3,h3)); -#endif - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); - q3 = _mm_nmacc_pd(w3, h4, q3); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1,h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2,h4)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(w3,h4)); -#endif - - _mm_store_pd(&q[i*ldq],q1); - _mm_store_pd(&q[(i*ldq)+2],q2); - _mm_store_pd(&q[(i*ldq)+4],q3); - } - - h1 = _mm_loaddup_pd(&hh[nb-3]); - q1 = _mm_load_pd(&q[nb*ldq]); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); - q3 = _mm_nmacc_pd(x3, h1, q3); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); - q3 = _mm_nmacc_pd(y3, h2, q3); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); - q3 = _mm_nmacc_pd(z3, h3, q3); 
-#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); -#endif - _mm_store_pd(&q[nb*ldq],q1); - _mm_store_pd(&q[(nb*ldq)+2],q2); - _mm_store_pd(&q[(nb*ldq)+4],q3); - - h1 = _mm_loaddup_pd(&hh[nb-2]); - q1 = _mm_load_pd(&q[(nb+1)*ldq]); - q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); - q3 = _mm_load_pd(&q[((nb+1)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); - q3 = _mm_nmacc_pd(x3, h1, q3); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); - q3 = _mm_nmacc_pd(y3, h2, q3); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); -#endif - _mm_store_pd(&q[(nb+1)*ldq],q1); - _mm_store_pd(&q[((nb+1)*ldq)+2],q2); - _mm_store_pd(&q[((nb+1)*ldq)+4],q3); - - h1 = _mm_loaddup_pd(&hh[nb-1]); - q1 = _mm_load_pd(&q[(nb+2)*ldq]); - q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); - q3 = _mm_load_pd(&q[((nb+2)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); - q3 = _mm_nmacc_pd(x3, h1, q3); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); -#endif - _mm_store_pd(&q[(nb+2)*ldq],q1); - _mm_store_pd(&q[((nb+2)*ldq)+2],q2); - _mm_store_pd(&q[((nb+2)*ldq)+4],q3); -} - -/** - * Unrolled kernel that computes - * 4 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) -{ - 
///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [4 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m128d a1_1 = _mm_load_pd(&q[ldq*3]); - __m128d a2_1 = _mm_load_pd(&q[ldq*2]); - __m128d a3_1 = _mm_load_pd(&q[ldq]); - __m128d a4_1 = _mm_load_pd(&q[0]); - - __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); - __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); - __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); - __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); - -#ifdef __ELPA_USE_FMA__ - __m128d w1 = _mm_macc_pd(a3_1, h_4_3, a4_1); - w1 = _mm_macc_pd(a2_1, h_4_2, w1); - w1 = _mm_macc_pd(a1_1, h_4_1, w1); - __m128d z1 = _mm_macc_pd(a2_1, h_3_2, a3_1); - z1 = _mm_macc_pd(a1_1, h_3_1, z1); - __m128d y1 = _mm_macc_pd(a1_1, h_2_1, a2_1); - __m128d x1 = a1_1; -#else - __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); - __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); - __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); - __m128d x1 = a1_1; -#endif - - __m128d a1_2 = _mm_load_pd(&q[(ldq*3)+2]); - __m128d a2_2 = _mm_load_pd(&q[(ldq*2)+2]); - __m128d a3_2 = _mm_load_pd(&q[ldq+2]); - __m128d a4_2 = _mm_load_pd(&q[0+2]); - -#ifdef __ELPA_USE_FMA__ - __m128d w2 = _mm_macc_pd(a3_2, h_4_3, a4_2); - w2 = _mm_macc_pd(a2_2, h_4_2, w2); - w2 = _mm_macc_pd(a1_2, h_4_1, w2); - __m128d z2 = _mm_macc_pd(a2_2, h_3_2, a3_2); - z2 = _mm_macc_pd(a1_2, h_3_1, z2); - __m128d y2 = _mm_macc_pd(a1_2, h_2_1, a2_2); - __m128d x2 = a1_2; -#else - __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); - w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); - w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1)); - __m128d z2 = _mm_add_pd(a3_2, 
_mm_mul_pd(a2_2, h_3_2)); - z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); - __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); - __m128d x2 = a1_2; -#endif - - __m128d q1; - __m128d q2; - - __m128d h1; - __m128d h2; - __m128d h3; - __m128d h4; - - for(i = 4; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-3]); - h2 = _mm_loaddup_pd(&hh[ldh+i-2]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); - - q1 = _mm_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - y1 = _mm_macc_pd(q1, h2, y1); - z1 = _mm_macc_pd(q1, h3, z1); - w1 = _mm_macc_pd(q1, h4, w1); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); -#endif - - q2 = _mm_load_pd(&q[(i*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_macc_pd(q2, h1, x2); - y2 = _mm_macc_pd(q2, h2, y2); - z2 = _mm_macc_pd(q2, h3, z2); - w2 = _mm_macc_pd(q2, h4, w2); -#else - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); - w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); -#endif - } - - h1 = _mm_loaddup_pd(&hh[nb-3]); - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); - - q1 = _mm_load_pd(&q[nb*ldq]); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); - z1 = _mm_macc_pd(q1, h3, z1); - z2 = _mm_macc_pd(q2, h3, z2); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); -#endif - - h1 = _mm_loaddup_pd(&hh[nb-2]); - h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); - - q1 = _mm_load_pd(&q[(nb+1)*ldq]); - q2 = 
_mm_load_pd(&q[((nb+1)*ldq)+2]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); -#endif - - h1 = _mm_loaddup_pd(&hh[nb-1]); - - q1 = _mm_load_pd(&q[(nb+2)*ldq]); - q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-1 update of Q [4 x nb+3] - ///////////////////////////////////////////////////// - - __m128d tau1 = _mm_loaddup_pd(&hh[0]); - __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); - __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); - __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); - - __m128d vs_1_2 = _mm_loaddup_pd(&s_1_2); - __m128d vs_1_3 = _mm_loaddup_pd(&s_1_3); - __m128d vs_2_3 = _mm_loaddup_pd(&s_2_3); - __m128d vs_1_4 = _mm_loaddup_pd(&s_1_4); - __m128d vs_2_4 = _mm_loaddup_pd(&s_2_4); - __m128d vs_3_4 = _mm_loaddup_pd(&s_3_4); - - h1 = tau1; - x1 = _mm_mul_pd(x1, h1); - x2 = _mm_mul_pd(x2, h1); - - h1 = tau2; - h2 = _mm_mul_pd(h1, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_msub_pd(y1, h1, _mm_mul_pd(x1,h2)); - y2 = _mm_msub_pd(y2, h1, _mm_mul_pd(x2,h2)); -#else - y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); - y2 = _mm_sub_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); -#endif - - h1 = tau3; - h2 = _mm_mul_pd(h1, vs_1_3); - h3 = _mm_mul_pd(h1, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_msub_pd(z1, h1, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))); - z2 = _mm_msub_pd(z2, h1, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2))); -#else - z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); - z2 = _mm_sub_pd(_mm_mul_pd(z2,h1), 
_mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); -#endif - - h1 = tau4; - h2 = _mm_mul_pd(h1, vs_1_4); - h3 = _mm_mul_pd(h1, vs_2_4); - h4 = _mm_mul_pd(h1, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_msub_pd(w1, h1, _mm_macc_pd(z1, h4, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); - w2 = _mm_msub_pd(w2, h1, _mm_macc_pd(z2, h4, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2)))); -#else - w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); - w2 = _mm_sub_pd(_mm_mul_pd(w2,h1), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); -#endif - - q1 = _mm_load_pd(&q[0]); - q2 = _mm_load_pd(&q[2]); - q1 = _mm_sub_pd(q1, w1); - q2 = _mm_sub_pd(q2, w2); - _mm_store_pd(&q[0],q1); - _mm_store_pd(&q[2],q2); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - q1 = _mm_load_pd(&q[ldq]); - q2 = _mm_load_pd(&q[ldq+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_macc_pd(w1, h4, z1)); - q2 = _mm_sub_pd(q2, _mm_macc_pd(w2, h4, z2)); -#else - q1 = _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, h4))); - q2 = _mm_sub_pd(q2, _mm_add_pd(z2, _mm_mul_pd(w2, h4))); -#endif - _mm_store_pd(&q[ldq],q1); - _mm_store_pd(&q[ldq+2],q2); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); - q1 = _mm_load_pd(&q[ldq*2]); - q2 = _mm_load_pd(&q[(ldq*2)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_macc_pd(z1, h3, _mm_mul_pd(w1, h4)))); - q2 = _mm_sub_pd(q2, _mm_add_pd(y2, _mm_macc_pd(z2, h3, _mm_mul_pd(w2, h4)))); -#else - q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4)))); - q2 = _mm_sub_pd(q2, _mm_add_pd(y2, _mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(w2, h4)))); -#endif - _mm_store_pd(&q[ldq*2],q1); - _mm_store_pd(&q[(ldq*2)+2],q2); - - h2 = _mm_loaddup_pd(&hh[ldh+1]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); - h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); - q1 = _mm_load_pd(&q[ldq*3]); - q2 = _mm_load_pd(&q[(ldq*3)+2]); -#ifdef __ELPA_USE_FMA__ - 
q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_macc_pd(y1, h2, _mm_macc_pd(z1, h3, _mm_mul_pd(w1, h4))))); - q2 = _mm_sub_pd(q2, _mm_add_pd(x2, _mm_macc_pd(y2, h2, _mm_macc_pd(z2, h3, _mm_mul_pd(w2, h4))))); -#else - q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_add_pd(_mm_mul_pd(y1, h2), _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4))))); - q2 = _mm_sub_pd(q2, _mm_add_pd(x2, _mm_add_pd(_mm_mul_pd(y2, h2), _mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(w2, h4))))); -#endif - _mm_store_pd(&q[ldq*3], q1); - _mm_store_pd(&q[(ldq*3)+2], q2); - - for (i = 4; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-3]); - h2 = _mm_loaddup_pd(&hh[ldh+i-2]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); - - q1 = _mm_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_macc_pd(w1, h4, _mm_mul_pd(z1, h3)), _mm_macc_pd(x1, h1, _mm_mul_pd(y1, h2)))); -#else - q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1, h4), _mm_mul_pd(z1, h3)), _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2)))); -#endif - _mm_store_pd(&q[i*ldq],q1); - - q2 = _mm_load_pd(&q[(i*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_sub_pd(q2, _mm_add_pd(_mm_macc_pd(w2, h4, _mm_mul_pd(z2, h3)), _mm_macc_pd(x2, h1, _mm_mul_pd(y2, h2)))); -#else - q2 = _mm_sub_pd(q2, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2, h4), _mm_mul_pd(z2, h3)), _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2)))); -#endif - _mm_store_pd(&q[(i*ldq)+2],q2); - } - - h1 = _mm_loaddup_pd(&hh[nb-3]); - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); - q1 = _mm_load_pd(&q[nb*ldq]); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_macc_pd(x1, h1, _mm_macc_pd(z1, h3, _mm_mul_pd(y1, h2)))); - q2 = _mm_sub_pd(q2, _mm_macc_pd(x2, h1, _mm_macc_pd(z2, h3, _mm_mul_pd(y2, h2)))); -#else - q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(y1, h2)) , _mm_mul_pd(x1, h1))); - q2 = _mm_sub_pd(q2, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z2, h3), 
_mm_mul_pd(y2, h2)) , _mm_mul_pd(x2, h1))); -#endif - _mm_store_pd(&q[nb*ldq],q1); - _mm_store_pd(&q[(nb*ldq)+2],q2); - - h1 = _mm_loaddup_pd(&hh[nb-2]); - h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); - q1 = _mm_load_pd(&q[(nb+1)*ldq]); - q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_macc_pd(y1, h2, _mm_mul_pd(x1, h1))); - q2 = _mm_sub_pd(q2, _mm_macc_pd(y2, h2, _mm_mul_pd(x2, h1))); -#else - q1 = _mm_sub_pd(q1, _mm_add_pd( _mm_mul_pd(y1, h2) , _mm_mul_pd(x1, h1))); - q2 = _mm_sub_pd(q2, _mm_add_pd( _mm_mul_pd(y2, h2) , _mm_mul_pd(x2, h1))); -#endif - _mm_store_pd(&q[(nb+1)*ldq],q1); - _mm_store_pd(&q[((nb+1)*ldq)+2],q2); - - h1 = _mm_loaddup_pd(&hh[nb-1]); - q1 = _mm_load_pd(&q[(nb+2)*ldq]); - q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); -#endif - _mm_store_pd(&q[(nb+2)*ldq],q1); - _mm_store_pd(&q[((nb+2)*ldq)+2],q2); -} - -/** - * Unrolled kernel that computes - * 2 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [2 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m128d a1_1 = _mm_load_pd(&q[ldq*3]); - __m128d a2_1 = _mm_load_pd(&q[ldq*2]); - __m128d a3_1 = _mm_load_pd(&q[ldq]); - __m128d a4_1 = _mm_load_pd(&q[0]); - - __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); - __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); - __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - __m128d h_4_2 = 
_mm_loaddup_pd(&hh[(ldh*3)+2]); - __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); - -#ifdef __ELPA_USE_FMA__ - __m128d w1 = _mm_macc_pd(a3_1, h_4_3, a4_1); - w1 = _mm_macc_pd(a2_1, h_4_2, w1); - w1 = _mm_macc_pd(a1_1, h_4_1, w1); - __m128d z1 = _mm_macc_pd(a2_1, h_3_2, a3_1); - z1 = _mm_macc_pd(a1_1, h_3_1, z1); - __m128d y1 = _mm_macc_pd(a1_1, h_2_1, a2_1); - __m128d x1 = a1_1; -#else - __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); - __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); - __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); - __m128d x1 = a1_1; -#endif - - __m128d q1; - - __m128d h1; - __m128d h2; - __m128d h3; - __m128d h4; - - for(i = 4; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-3]); - h2 = _mm_loaddup_pd(&hh[ldh+i-2]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); - - q1 = _mm_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - y1 = _mm_macc_pd(q1, h2, y1); - z1 = _mm_macc_pd(q1, h3, z1); - w1 = _mm_macc_pd(q1, h4, w1); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); -#endif - } - - h1 = _mm_loaddup_pd(&hh[nb-3]); - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); - q1 = _mm_load_pd(&q[nb*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - y1 = _mm_macc_pd(q1, h2, y1); - z1 = _mm_macc_pd(q1, h3, z1); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); -#endif - - h1 = _mm_loaddup_pd(&hh[nb-2]); - h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); - q1 = _mm_load_pd(&q[(nb+1)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - y1 = _mm_macc_pd(q1, h2, y1); -#else - x1 = 
_mm_add_pd(x1, _mm_mul_pd(q1,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); -#endif - - h1 = _mm_loaddup_pd(&hh[nb-1]); - q1 = _mm_load_pd(&q[(nb+2)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); -#endif - ///////////////////////////////////////////////////// - // Rank-1 update of Q [2 x nb+3] - ///////////////////////////////////////////////////// - - __m128d tau1 = _mm_loaddup_pd(&hh[0]); - __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); - __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); - __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); - - __m128d vs_1_2 = _mm_loaddup_pd(&s_1_2); - __m128d vs_1_3 = _mm_loaddup_pd(&s_1_3); - __m128d vs_2_3 = _mm_loaddup_pd(&s_2_3); - __m128d vs_1_4 = _mm_loaddup_pd(&s_1_4); - __m128d vs_2_4 = _mm_loaddup_pd(&s_2_4); - __m128d vs_3_4 = _mm_loaddup_pd(&s_3_4); - - h1 = tau1; - x1 = _mm_mul_pd(x1, h1); - - h1 = tau2; - h2 = _mm_mul_pd(h1, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_msub_pd(y1, h1, _mm_mul_pd(x1,h2)); -#else - y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); -#endif - - h1 = tau3; - h2 = _mm_mul_pd(h1, vs_1_3); - h3 = _mm_mul_pd(h1, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_msub_pd(z1, h1, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))); -#else - z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); -#endif - - h1 = tau4; - h2 = _mm_mul_pd(h1, vs_1_4); - h3 = _mm_mul_pd(h1, vs_2_4); - h4 = _mm_mul_pd(h1, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_msub_pd(w1, h1, _mm_macc_pd(z1, h4, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); -#else - w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); -#endif - - q1 = _mm_load_pd(&q[0]); - q1 = _mm_sub_pd(q1, w1); - _mm_store_pd(&q[0],q1); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - q1 = _mm_load_pd(&q[ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_macc_pd(w1, h4, z1)); -#else - q1 = _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, 
h4))); -#endif - _mm_store_pd(&q[ldq],q1); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); - q1 = _mm_load_pd(&q[ldq*2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_macc_pd(z1, h3, _mm_mul_pd(w1, h4)))); -#else - q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4)))); -#endif - _mm_store_pd(&q[ldq*2],q1); - - h2 = _mm_loaddup_pd(&hh[ldh+1]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); - h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); - q1 = _mm_load_pd(&q[ldq*3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_macc_pd(y1, h2, _mm_macc_pd(z1, h3, _mm_mul_pd(w1, h4))))); -#else - q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_add_pd(_mm_mul_pd(y1, h2), _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4))))); -#endif - _mm_store_pd(&q[ldq*3], q1); - - for (i = 4; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-3]); - h2 = _mm_loaddup_pd(&hh[ldh+i-2]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); - - q1 = _mm_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_macc_pd(w1, h4, _mm_mul_pd(z1, h3)), _mm_macc_pd(x1, h1, _mm_mul_pd(y1, h2)))); -#else - q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1, h4), _mm_mul_pd(z1, h3)), _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2)))); -#endif - _mm_store_pd(&q[i*ldq],q1); - } - - h1 = _mm_loaddup_pd(&hh[nb-3]); - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); - q1 = _mm_load_pd(&q[nb*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_macc_pd(x1, h1, _mm_macc_pd(z1, h3, _mm_mul_pd(y1, h2)))); -#else - q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(y1, h2)) , _mm_mul_pd(x1, h1))); -#endif - _mm_store_pd(&q[nb*ldq],q1); - - h1 = _mm_loaddup_pd(&hh[nb-2]); - h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); - q1 = _mm_load_pd(&q[(nb+1)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_sub_pd(q1, _mm_macc_pd(y1, h2, _mm_mul_pd(x1, h1))); -#else 
- q1 = _mm_sub_pd(q1, _mm_add_pd( _mm_mul_pd(y1, h2) , _mm_mul_pd(x1, h1))); -#endif - _mm_store_pd(&q[(nb+1)*ldq],q1); - - h1 = _mm_loaddup_pd(&hh[nb-1]); - q1 = _mm_load_pd(&q[(nb+2)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); -#endif - _mm_store_pd(&q[(nb+2)*ldq],q1); -} -#endif diff --git a/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c b/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c deleted file mode 100644 index 5a033888c..000000000 --- a/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c +++ /dev/null @@ -1,3104 +0,0 @@ -// This file is part of ELPA. -// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// This particular source code file contains additions, changes and -// enhancements authored by Intel Corporation which is not part of -// the ELPA consortium. -// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. If not, see <http://www.gnu.org/licenses/> -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. -// -// -// -------------------------------------------------------------------------------------------------- -// -// This file contains the compute intensive kernels for the Householder transformations. -// It should be compiled with the highest possible optimization level. -// -// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 -// On Intel Sandy Bridge use -O3 -mavx -// -// Copyright of the original code rests with the authors inside the ELPA -// consortium. The copyright of any additional modifications shall rest -// with their original authors, but shall adhere to the licensing terms -// distributed along with the original code in the file "COPYING". 
-// -// Author: Alexander Heinecke (alexander.heinecke@mytum.de) -// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) -// -------------------------------------------------------------------------------------------------- - -#include <x86intrin.h> - -#define __forceinline __attribute__((always_inline)) static - -#ifdef __USE_AVX128__ -#undef __AVX__ -#endif - -#ifdef __FMA4__ -#define __ELPA_USE_FMA__ -#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) -#define _mm256_NFMA_pd(a,b,c) _mm256_nmacc_pd(a,b,c) -#define _mm256_FMSUB_pd(a,b,c) _mm256_msub(a,b,c) -#endif - -#ifdef __AVX2__ -#define __ELPA_USE_FMA__ -#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c) -#define _mm256_NFMA_pd(a,b,c) _mm256_fnmadd_pd(a,b,c) -#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c) -#endif - -//Forward declaration -#ifdef __AVX__ -static void hh_trafo_kernel_4_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); -static void hh_trafo_kernel_8_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); -#else -static void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); -static void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); -#endif - -void hexa_hh_trafo_real_sse_avx_6hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); -#if 0 -void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); -#endif - -void hexa_hh_trafo_real_sse_avx_6hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - // calculating scalar products to compute - // 6 householder vectors simultaneously - double scalarprods[15]; - -// scalarprods[0] = s_1_2; -// scalarprods[1] = s_1_3; -// scalarprods[2] = s_2_3; -// scalarprods[3] = s_1_4; -// scalarprods[4] = s_2_4; -// 
scalarprods[5] = s_3_4; -// scalarprods[6] = s_1_5; -// scalarprods[7] = s_2_5; -// scalarprods[8] = s_3_5; -// scalarprods[9] = s_4_5; -// scalarprods[10] = s_1_6; -// scalarprods[11] = s_2_6; -// scalarprods[12] = s_3_6; -// scalarprods[13] = s_4_6; -// scalarprods[14] = s_5_6; - - scalarprods[0] = hh[(ldh+1)]; - scalarprods[1] = hh[(ldh*2)+2]; - scalarprods[2] = hh[(ldh*2)+1]; - scalarprods[3] = hh[(ldh*3)+3]; - scalarprods[4] = hh[(ldh*3)+2]; - scalarprods[5] = hh[(ldh*3)+1]; - scalarprods[6] = hh[(ldh*4)+4]; - scalarprods[7] = hh[(ldh*4)+3]; - scalarprods[8] = hh[(ldh*4)+2]; - scalarprods[9] = hh[(ldh*4)+1]; - scalarprods[10] = hh[(ldh*5)+5]; - scalarprods[11] = hh[(ldh*5)+4]; - scalarprods[12] = hh[(ldh*5)+3]; - scalarprods[13] = hh[(ldh*5)+2]; - scalarprods[14] = hh[(ldh*5)+1]; - - // calculate scalar product of first and fourth householder vector - // loop counter = 2 - scalarprods[0] += hh[1] * hh[(2+ldh)]; - scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; - - // loop counter = 3 - scalarprods[0] += hh[2] * hh[(3+ldh)]; - scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; - - scalarprods[1] += hh[1] * hh[3+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; - - // loop counter = 4 - scalarprods[0] += hh[3] * hh[(4+ldh)]; - scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; - - scalarprods[1] += hh[2] * hh[4+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+2] * 
hh[4+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; - - scalarprods[3] += hh[1] * hh[4+(ldh*3)]; - scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; - - // loop counter = 5 - scalarprods[0] += hh[4] * hh[(5+ldh)]; - scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; - - scalarprods[1] += hh[3] * hh[5+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; - - scalarprods[3] += hh[2] * hh[5+(ldh*3)]; - scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; - - scalarprods[6] += hh[1] * hh[5+(ldh*4)]; - scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; - - #pragma ivdep - for (i = 6; i < nb; i++) - { - scalarprods[0] += hh[i-1] * hh[(i+ldh)]; - scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; - - scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; - - scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; - scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; - - scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; - scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; - - scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; - } - -// printf("s_1_2: %f\n", scalarprods[0]); -// printf("s_1_3: %f\n", scalarprods[1]); -// printf("s_2_3: %f\n", scalarprods[2]); -// printf("s_1_4: %f\n", scalarprods[3]); -// printf("s_2_4: %f\n", scalarprods[4]); -// printf("s_3_4: %f\n", scalarprods[5]); -// 
printf("s_1_5: %f\n", scalarprods[6]); -// printf("s_2_5: %f\n", scalarprods[7]); -// printf("s_3_5: %f\n", scalarprods[8]); -// printf("s_4_5: %f\n", scalarprods[9]); -// printf("s_1_6: %f\n", scalarprods[10]); -// printf("s_2_6: %f\n", scalarprods[11]); -// printf("s_3_6: %f\n", scalarprods[12]); -// printf("s_4_6: %f\n", scalarprods[13]); -// printf("s_5_6: %f\n", scalarprods[14]); - - // Production level kernel calls with padding -#ifdef __AVX__ - for (i = 0; i < nq-4; i+=8) - { - hh_trafo_kernel_8_AVX_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); - } - if (nq == i) - { - return; - } - else - { - hh_trafo_kernel_4_AVX_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); - } -#else - for (i = 0; i < nq-2; i+=4) - { - hh_trafo_kernel_4_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); - } - if (nq == i) - { - return; - } - else - { - hh_trafo_kernel_2_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); - } -#endif -} - -#if 0 -void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - // calculating scalar products to compute - // 6 householder vectors simultaneously - double scalarprods[15]; - -// scalarprods[0] = s_1_2; -// scalarprods[1] = s_1_3; -// scalarprods[2] = s_2_3; -// scalarprods[3] = s_1_4; -// scalarprods[4] = s_2_4; -// scalarprods[5] = s_3_4; -// scalarprods[6] = s_1_5; -// scalarprods[7] = s_2_5; -// scalarprods[8] = s_3_5; -// scalarprods[9] = s_4_5; -// scalarprods[10] = s_1_6; -// scalarprods[11] = s_2_6; -// scalarprods[12] = s_3_6; -// scalarprods[13] = s_4_6; -// scalarprods[14] = s_5_6; - - scalarprods[0] = hh[(ldh+1)]; - scalarprods[1] = hh[(ldh*2)+2]; - scalarprods[2] = hh[(ldh*2)+1]; - scalarprods[3] = hh[(ldh*3)+3]; - scalarprods[4] = hh[(ldh*3)+2]; - scalarprods[5] = hh[(ldh*3)+1]; - scalarprods[6] = hh[(ldh*4)+4]; - scalarprods[7] = hh[(ldh*4)+3]; - scalarprods[8] = hh[(ldh*4)+2]; - scalarprods[9] = hh[(ldh*4)+1]; - scalarprods[10] = 
hh[(ldh*5)+5]; - scalarprods[11] = hh[(ldh*5)+4]; - scalarprods[12] = hh[(ldh*5)+3]; - scalarprods[13] = hh[(ldh*5)+2]; - scalarprods[14] = hh[(ldh*5)+1]; - - // calculate scalar product of first and fourth householder vector - // loop counter = 2 - scalarprods[0] += hh[1] * hh[(2+ldh)]; - scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; - - // loop counter = 3 - scalarprods[0] += hh[2] * hh[(3+ldh)]; - scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; - - scalarprods[1] += hh[1] * hh[3+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; - - // loop counter = 4 - scalarprods[0] += hh[3] * hh[(4+ldh)]; - scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; - - scalarprods[1] += hh[2] * hh[4+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; - - scalarprods[3] += hh[1] * hh[4+(ldh*3)]; - scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; - - // loop counter = 5 - scalarprods[0] += hh[4] * hh[(5+ldh)]; - scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; - - scalarprods[1] += hh[3] * hh[5+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; - 
scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; - - scalarprods[3] += hh[2] * hh[5+(ldh*3)]; - scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; - - scalarprods[6] += hh[1] * hh[5+(ldh*4)]; - scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; - - #pragma ivdep - for (i = 6; i < nb; i++) - { - scalarprods[0] += hh[i-1] * hh[(i+ldh)]; - scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; - - scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; - - scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; - scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; - - scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; - scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; - - scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; - } - -// printf("s_1_2: %f\n", scalarprods[0]); -// printf("s_1_3: %f\n", scalarprods[1]); -// printf("s_2_3: %f\n", scalarprods[2]); -// printf("s_1_4: %f\n", scalarprods[3]); -// printf("s_2_4: %f\n", scalarprods[4]); -// printf("s_3_4: %f\n", scalarprods[5]); -// printf("s_1_5: %f\n", scalarprods[6]); -// printf("s_2_5: %f\n", scalarprods[7]); -// printf("s_3_5: %f\n", scalarprods[8]); -// printf("s_4_5: %f\n", scalarprods[9]); -// printf("s_1_6: %f\n", scalarprods[10]); -// printf("s_2_6: %f\n", scalarprods[11]); -// printf("s_3_6: %f\n", scalarprods[12]); -// printf("s_4_6: %f\n", scalarprods[13]); -// printf("s_5_6: %f\n", scalarprods[14]); - - // Production level kernel calls with padding -#ifdef __AVX__ - for (i = 0; i < nq; i+=8) - { - hh_trafo_kernel_8_AVX_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); - } -#else - for (i = 0; i < nq; i+=4) - { - hh_trafo_kernel_4_SSE_6hv(&q[i], 
hh, nb, ldq, ldh, scalarprods); - } -#endif -} -#endif - -#ifdef __AVX__ -/** - * Unrolled kernel that computes - * 8 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_8_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [8 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m256d a1_1 = _mm256_load_pd(&q[ldq*5]); - __m256d a2_1 = _mm256_load_pd(&q[ldq*4]); - __m256d a3_1 = _mm256_load_pd(&q[ldq*3]); - __m256d a4_1 = _mm256_load_pd(&q[ldq*2]); - __m256d a5_1 = _mm256_load_pd(&q[ldq]); - __m256d a6_1 = _mm256_load_pd(&q[0]); - - __m256d h_6_5 = _mm256_broadcast_sd(&hh[(ldh*5)+1]); - __m256d h_6_4 = _mm256_broadcast_sd(&hh[(ldh*5)+2]); - __m256d h_6_3 = _mm256_broadcast_sd(&hh[(ldh*5)+3]); - __m256d h_6_2 = _mm256_broadcast_sd(&hh[(ldh*5)+4]); - __m256d h_6_1 = _mm256_broadcast_sd(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - register __m256d t1 = _mm256_FMA_pd(a5_1, h_6_5, a6_1); - t1 = _mm256_FMA_pd(a4_1, h_6_4, t1); - t1 = _mm256_FMA_pd(a3_1, h_6_3, t1); - t1 = _mm256_FMA_pd(a2_1, h_6_2, t1); - t1 = _mm256_FMA_pd(a1_1, h_6_1, t1); -#else - register __m256d t1 = _mm256_add_pd(a6_1, _mm256_mul_pd(a5_1, h_6_5)); - t1 = _mm256_add_pd(t1, _mm256_mul_pd(a4_1, h_6_4)); - t1 = _mm256_add_pd(t1, _mm256_mul_pd(a3_1, h_6_3)); - t1 = _mm256_add_pd(t1, _mm256_mul_pd(a2_1, h_6_2)); - t1 = _mm256_add_pd(t1, _mm256_mul_pd(a1_1, h_6_1)); -#endif - __m256d h_5_4 = _mm256_broadcast_sd(&hh[(ldh*4)+1]); - __m256d h_5_3 = _mm256_broadcast_sd(&hh[(ldh*4)+2]); - __m256d h_5_2 = _mm256_broadcast_sd(&hh[(ldh*4)+3]); - __m256d h_5_1 = _mm256_broadcast_sd(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - register __m256d v1 = _mm256_FMA_pd(a4_1, h_5_4, a5_1); - v1 = _mm256_FMA_pd(a3_1, h_5_3, v1); - 
v1 = _mm256_FMA_pd(a2_1, h_5_2, v1); - v1 = _mm256_FMA_pd(a1_1, h_5_1, v1); -#else - register __m256d v1 = _mm256_add_pd(a5_1, _mm256_mul_pd(a4_1, h_5_4)); - v1 = _mm256_add_pd(v1, _mm256_mul_pd(a3_1, h_5_3)); - v1 = _mm256_add_pd(v1, _mm256_mul_pd(a2_1, h_5_2)); - v1 = _mm256_add_pd(v1, _mm256_mul_pd(a1_1, h_5_1)); -#endif - __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); - __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - register __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); - w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); - w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); -#else - register __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); -#endif - __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - register __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); - z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); - register __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); -#else - register __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); - register __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); -#endif - register __m256d x1 = a1_1; - - - __m256d a1_2 = _mm256_load_pd(&q[(ldq*5)+4]); - __m256d a2_2 = _mm256_load_pd(&q[(ldq*4)+4]); - __m256d a3_2 = _mm256_load_pd(&q[(ldq*3)+4]); - __m256d a4_2 = _mm256_load_pd(&q[(ldq*2)+4]); - __m256d a5_2 = _mm256_load_pd(&q[(ldq)+4]); - __m256d a6_2 = _mm256_load_pd(&q[4]); - -#ifdef __ELPA_USE_FMA__ - register __m256d t2 = _mm256_FMA_pd(a5_2, h_6_5, a6_2); - t2 = _mm256_FMA_pd(a4_2, h_6_4, t2); - t2 = _mm256_FMA_pd(a3_2, h_6_3, t2); - t2 = _mm256_FMA_pd(a2_2, h_6_2, t2); - t2 = _mm256_FMA_pd(a1_2, h_6_1, t2); - register __m256d v2 = 
_mm256_FMA_pd(a4_2, h_5_4, a5_2); - v2 = _mm256_FMA_pd(a3_2, h_5_3, v2); - v2 = _mm256_FMA_pd(a2_2, h_5_2, v2); - v2 = _mm256_FMA_pd(a1_2, h_5_1, v2); - register __m256d w2 = _mm256_FMA_pd(a3_2, h_4_3, a4_2); - w2 = _mm256_FMA_pd(a2_2, h_4_2, w2); - w2 = _mm256_FMA_pd(a1_2, h_4_1, w2); - register __m256d z2 = _mm256_FMA_pd(a2_2, h_3_2, a3_2); - z2 = _mm256_FMA_pd(a1_2, h_3_1, z2); - register __m256d y2 = _mm256_FMA_pd(a1_2, h_2_1, a2_2); -#else - register __m256d t2 = _mm256_add_pd(a6_2, _mm256_mul_pd(a5_2, h_6_5)); - t2 = _mm256_add_pd(t2, _mm256_mul_pd(a4_2, h_6_4)); - t2 = _mm256_add_pd(t2, _mm256_mul_pd(a3_2, h_6_3)); - t2 = _mm256_add_pd(t2, _mm256_mul_pd(a2_2, h_6_2)); - t2 = _mm256_add_pd(t2, _mm256_mul_pd(a1_2, h_6_1)); - register __m256d v2 = _mm256_add_pd(a5_2, _mm256_mul_pd(a4_2, h_5_4)); - v2 = _mm256_add_pd(v2, _mm256_mul_pd(a3_2, h_5_3)); - v2 = _mm256_add_pd(v2, _mm256_mul_pd(a2_2, h_5_2)); - v2 = _mm256_add_pd(v2, _mm256_mul_pd(a1_2, h_5_1)); - register __m256d w2 = _mm256_add_pd(a4_2, _mm256_mul_pd(a3_2, h_4_3)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(a2_2, h_4_2)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(a1_2, h_4_1)); - register __m256d z2 = _mm256_add_pd(a3_2, _mm256_mul_pd(a2_2, h_3_2)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(a1_2, h_3_1)); - register __m256d y2 = _mm256_add_pd(a2_2, _mm256_mul_pd(a1_2, h_2_1)); -#endif - register __m256d x2 = a1_2; - - __m256d q1; - __m256d q2; - - __m256d h1; - __m256d h2; - __m256d h3; - __m256d h4; - __m256d h5; - __m256d h6; - - for(i = 6; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-5]); - q1 = _mm256_load_pd(&q[i*ldq]); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); -#else - 
y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); - z2 = _mm256_FMA_pd(q2, h3, z2); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_pd(q1, h4, w1); - w2 = _mm256_FMA_pd(q2, h4, w2); -#else - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMA_pd(q1, h5, v1); - v2 = _mm256_FMA_pd(q2, h5, v2); -#else - v1 = _mm256_add_pd(v1, _mm256_mul_pd(q1,h5)); - v2 = _mm256_add_pd(v2, _mm256_mul_pd(q2,h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - t1 = _mm256_FMA_pd(q1, h6, t1); - t2 = _mm256_FMA_pd(q2, h6, t2); -#else - t1 = _mm256_add_pd(t1, _mm256_mul_pd(q1,h6)); - t2 = _mm256_add_pd(t2, _mm256_mul_pd(q2,h6)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-5]); - q1 = _mm256_load_pd(&q[nb*ldq]); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); - z2 = _mm256_FMA_pd(q2, h3, z2); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - w1 = 
_mm256_FMA_pd(q1, h4, w1); - w2 = _mm256_FMA_pd(q2, h4, w2); -#else - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMA_pd(q1, h5, v1); - v2 = _mm256_FMA_pd(q2, h5, v2); -#else - v1 = _mm256_add_pd(v1, _mm256_mul_pd(q1,h5)); - v2 = _mm256_add_pd(v2, _mm256_mul_pd(q2,h5)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-4]); - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); - q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); - z2 = _mm256_FMA_pd(q2, h3, z2); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_pd(q1, h4, w1); - w2 = _mm256_FMA_pd(q2, h4, w2); -#else - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); - q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); -#else - y1 = _mm256_add_pd(y1, 
_mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); - z2 = _mm256_FMA_pd(q2, h3, z2); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - q1 = _mm256_load_pd(&q[(nb+3)*ldq]); - q2 = _mm256_load_pd(&q[((nb+3)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - q1 = _mm256_load_pd(&q[(nb+4)*ldq]); - q2 = _mm256_load_pd(&q[((nb+4)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - - ///////////////////////////////////////////////////// - // Apply tau, correct wrong calculation using pre-calculated scalar products - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(&hh[0]); - x1 = _mm256_mul_pd(x1, tau1); - x2 = _mm256_mul_pd(x2, tau1); - - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d vs_1_2 = _mm256_broadcast_sd(&scalarprods[0]); - h2 = _mm256_mul_pd(tau2, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMSUB_pd(y1, tau2, _mm256_mul_pd(x1,h2)); - y2 = _mm256_FMSUB_pd(y2, tau2, _mm256_mul_pd(x2,h2)); -#else - y1 = _mm256_sub_pd(_mm256_mul_pd(y1,tau2), _mm256_mul_pd(x1,h2)); - y2 = _mm256_sub_pd(_mm256_mul_pd(y2,tau2), _mm256_mul_pd(x2,h2)); -#endif - - __m256d tau3 = 
_mm256_broadcast_sd(&hh[ldh*2]); - __m256d vs_1_3 = _mm256_broadcast_sd(&scalarprods[1]); - __m256d vs_2_3 = _mm256_broadcast_sd(&scalarprods[2]); - h2 = _mm256_mul_pd(tau3, vs_1_3); - h3 = _mm256_mul_pd(tau3, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMSUB_pd(z1, tau3, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); - z2 = _mm256_FMSUB_pd(z2, tau3, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))); -#else - z1 = _mm256_sub_pd(_mm256_mul_pd(z1,tau3), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); - z2 = _mm256_sub_pd(_mm256_mul_pd(z2,tau3), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))); -#endif - - __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); - __m256d vs_1_4 = _mm256_broadcast_sd(&scalarprods[3]); - __m256d vs_2_4 = _mm256_broadcast_sd(&scalarprods[4]); - h2 = _mm256_mul_pd(tau4, vs_1_4); - h3 = _mm256_mul_pd(tau4, vs_2_4); - __m256d vs_3_4 = _mm256_broadcast_sd(&scalarprods[5]); - h4 = _mm256_mul_pd(tau4, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMSUB_pd(w1, tau4, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); - w2 = _mm256_FMSUB_pd(w2, tau4, _mm256_FMA_pd(z2, h4, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); -#else - w1 = _mm256_sub_pd(_mm256_mul_pd(w1,tau4), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); - w2 = _mm256_sub_pd(_mm256_mul_pd(w2,tau4), _mm256_add_pd(_mm256_mul_pd(z2,h4), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); -#endif - - __m256d tau5 = _mm256_broadcast_sd(&hh[ldh*4]); - __m256d vs_1_5 = _mm256_broadcast_sd(&scalarprods[6]); - __m256d vs_2_5 = _mm256_broadcast_sd(&scalarprods[7]); - h2 = _mm256_mul_pd(tau5, vs_1_5); - h3 = _mm256_mul_pd(tau5, vs_2_5); - __m256d vs_3_5 = _mm256_broadcast_sd(&scalarprods[8]); - __m256d vs_4_5 = _mm256_broadcast_sd(&scalarprods[9]); - h4 = _mm256_mul_pd(tau5, vs_3_5); - h5 = _mm256_mul_pd(tau5, vs_4_5); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMSUB_pd(v1, tau5, _mm256_add_pd(_mm256_FMA_pd(w1, 
h5, _mm256_mul_pd(z1,h4)), _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); - v2 = _mm256_FMSUB_pd(v2, tau5, _mm256_add_pd(_mm256_FMA_pd(w2, h5, _mm256_mul_pd(z2,h4)), _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); -#else - v1 = _mm256_sub_pd(_mm256_mul_pd(v1,tau5), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1,h5), _mm256_mul_pd(z1,h4)), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); - v2 = _mm256_sub_pd(_mm256_mul_pd(v2,tau5), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w2,h5), _mm256_mul_pd(z2,h4)), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); -#endif - - __m256d tau6 = _mm256_broadcast_sd(&hh[ldh*5]); - __m256d vs_1_6 = _mm256_broadcast_sd(&scalarprods[10]); - __m256d vs_2_6 = _mm256_broadcast_sd(&scalarprods[11]); - h2 = _mm256_mul_pd(tau6, vs_1_6); - h3 = _mm256_mul_pd(tau6, vs_2_6); - __m256d vs_3_6 = _mm256_broadcast_sd(&scalarprods[12]); - __m256d vs_4_6 = _mm256_broadcast_sd(&scalarprods[13]); - __m256d vs_5_6 = _mm256_broadcast_sd(&scalarprods[14]); - h4 = _mm256_mul_pd(tau6, vs_3_6); - h5 = _mm256_mul_pd(tau6, vs_4_6); - h6 = _mm256_mul_pd(tau6, vs_5_6); -#ifdef __ELPA_USE_FMA__ - t1 = _mm256_FMSUB_pd(t1, tau6, _mm256_FMA_pd(v1, h6, _mm256_add_pd(_mm256_FMA_pd(w1, h5, _mm256_mul_pd(z1,h4)), _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))))); - t2 = _mm256_FMSUB_pd(t2, tau6, _mm256_FMA_pd(v2, h6, _mm256_add_pd(_mm256_FMA_pd(w2, h5, _mm256_mul_pd(z2,h4)), _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))))); -#else - t1 = _mm256_sub_pd(_mm256_mul_pd(t1,tau6), _mm256_add_pd( _mm256_mul_pd(v1,h6), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1,h5), _mm256_mul_pd(z1,h4)), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))))); - t2 = _mm256_sub_pd(_mm256_mul_pd(t2,tau6), _mm256_add_pd( _mm256_mul_pd(v2,h6), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w2,h5), _mm256_mul_pd(z2,h4)), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))))); -#endif - - ///////////////////////////////////////////////////// - // Rank-1 update of Q [8 x nb+3] 
- ///////////////////////////////////////////////////// - - q1 = _mm256_load_pd(&q[0]); - q2 = _mm256_load_pd(&q[4]); - q1 = _mm256_sub_pd(q1, t1); - q2 = _mm256_sub_pd(q2, t2); - _mm256_store_pd(&q[0],q1); - _mm256_store_pd(&q[4],q2); - - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+1]); - q1 = _mm256_load_pd(&q[ldq]); - q2 = _mm256_load_pd(&q[(ldq+4)]); - q1 = _mm256_sub_pd(q1, v1); - q2 = _mm256_sub_pd(q2, v2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); - q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); -#endif - _mm256_store_pd(&q[ldq],q1); - _mm256_store_pd(&q[(ldq+4)],q2); - - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+1]); - q1 = _mm256_load_pd(&q[ldq*2]); - q2 = _mm256_load_pd(&q[(ldq*2)+4]); - q1 = _mm256_sub_pd(q1, w1); - q2 = _mm256_sub_pd(q2, w2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); - q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); - q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); -#endif - _mm256_store_pd(&q[ldq*2],q1); - _mm256_store_pd(&q[(ldq*2)+4],q2); - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - q1 = _mm256_load_pd(&q[ldq*3]); - q2 = _mm256_load_pd(&q[(ldq*3)+4]); - q1 = _mm256_sub_pd(q1, z1); - q2 = _mm256_sub_pd(q2, z2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); - q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); - q2 = _mm256_sub_pd(q2, 
_mm256_mul_pd(v2, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); - q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); -#endif - _mm256_store_pd(&q[ldq*3],q1); - _mm256_store_pd(&q[(ldq*3)+4],q2); - - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - q1 = _mm256_load_pd(&q[ldq*4]); - q2 = _mm256_load_pd(&q[(ldq*4)+4]); - q1 = _mm256_sub_pd(q1, y1); - q2 = _mm256_sub_pd(q2, y2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); - q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); - q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); -#endif - _mm256_store_pd(&q[ldq*4],q1); - _mm256_store_pd(&q[(ldq*4)+4],q2); - - h2 = _mm256_broadcast_sd(&hh[(ldh)+1]); - q1 = _mm256_load_pd(&q[ldq*5]); - q2 = _mm256_load_pd(&q[(ldq*5)+4]); - q1 = _mm256_sub_pd(q1, x1); - q2 = _mm256_sub_pd(q2, x2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); 
-#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); - q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); - q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); -#endif - _mm256_store_pd(&q[ldq*5],q1); - _mm256_store_pd(&q[(ldq*5)+4],q2); - - for (i = 6; i < nb; i++) - { - q1 = _mm256_load_pd(&q[i*ldq]); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - h1 = _mm256_broadcast_sd(&hh[i-5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 
= _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); - q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); - q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); -#endif - _mm256_store_pd(&q[i*ldq],q1); - _mm256_store_pd(&q[(i*ldq)+4],q2); - } - - h1 = _mm256_broadcast_sd(&hh[nb-5]); - q1 = _mm256_load_pd(&q[nb*ldq]); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); - q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, 
h5)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - _mm256_store_pd(&q[nb*ldq],q1); - _mm256_store_pd(&q[(nb*ldq)+4],q2); - - h1 = _mm256_broadcast_sd(&hh[nb-4]); - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); - q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - _mm256_store_pd(&q[(nb+1)*ldq],q1); - _mm256_store_pd(&q[((nb+1)*ldq)+4],q2); - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); - q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - 
q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - _mm256_store_pd(&q[(nb+2)*ldq],q1); - _mm256_store_pd(&q[((nb+2)*ldq)+4],q2); - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - q1 = _mm256_load_pd(&q[(nb+3)*ldq]); - q2 = _mm256_load_pd(&q[((nb+3)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - _mm256_store_pd(&q[(nb+3)*ldq],q1); - _mm256_store_pd(&q[((nb+3)*ldq)+4],q2); - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - q1 = _mm256_load_pd(&q[(nb+4)*ldq]); - q2 = _mm256_load_pd(&q[((nb+4)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - _mm256_store_pd(&q[(nb+4)*ldq],q1); - _mm256_store_pd(&q[((nb+4)*ldq)+4],q2); -} - -/** - * Unrolled kernel that computes - * 4 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_4_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [8 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m256d a1_1 = _mm256_load_pd(&q[ldq*5]); - __m256d a2_1 = _mm256_load_pd(&q[ldq*4]); - __m256d a3_1 = _mm256_load_pd(&q[ldq*3]); - __m256d a4_1 = _mm256_load_pd(&q[ldq*2]); - __m256d a5_1 
= _mm256_load_pd(&q[ldq]); - __m256d a6_1 = _mm256_load_pd(&q[0]); - - __m256d h_6_5 = _mm256_broadcast_sd(&hh[(ldh*5)+1]); - __m256d h_6_4 = _mm256_broadcast_sd(&hh[(ldh*5)+2]); - __m256d h_6_3 = _mm256_broadcast_sd(&hh[(ldh*5)+3]); - __m256d h_6_2 = _mm256_broadcast_sd(&hh[(ldh*5)+4]); - __m256d h_6_1 = _mm256_broadcast_sd(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - register __m256d t1 = _mm256_FMA_pd(a5_1, h_6_5, a6_1); - t1 = _mm256_FMA_pd(a4_1, h_6_4, t1); - t1 = _mm256_FMA_pd(a3_1, h_6_3, t1); - t1 = _mm256_FMA_pd(a2_1, h_6_2, t1); - t1 = _mm256_FMA_pd(a1_1, h_6_1, t1); -#else - register __m256d t1 = _mm256_add_pd(a6_1, _mm256_mul_pd(a5_1, h_6_5)); - t1 = _mm256_add_pd(t1, _mm256_mul_pd(a4_1, h_6_4)); - t1 = _mm256_add_pd(t1, _mm256_mul_pd(a3_1, h_6_3)); - t1 = _mm256_add_pd(t1, _mm256_mul_pd(a2_1, h_6_2)); - t1 = _mm256_add_pd(t1, _mm256_mul_pd(a1_1, h_6_1)); -#endif - __m256d h_5_4 = _mm256_broadcast_sd(&hh[(ldh*4)+1]); - __m256d h_5_3 = _mm256_broadcast_sd(&hh[(ldh*4)+2]); - __m256d h_5_2 = _mm256_broadcast_sd(&hh[(ldh*4)+3]); - __m256d h_5_1 = _mm256_broadcast_sd(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - register __m256d v1 = _mm256_FMA_pd(a4_1, h_5_4, a5_1); - v1 = _mm256_FMA_pd(a3_1, h_5_3, v1); - v1 = _mm256_FMA_pd(a2_1, h_5_2, v1); - v1 = _mm256_FMA_pd(a1_1, h_5_1, v1); -#else - register __m256d v1 = _mm256_add_pd(a5_1, _mm256_mul_pd(a4_1, h_5_4)); - v1 = _mm256_add_pd(v1, _mm256_mul_pd(a3_1, h_5_3)); - v1 = _mm256_add_pd(v1, _mm256_mul_pd(a2_1, h_5_2)); - v1 = _mm256_add_pd(v1, _mm256_mul_pd(a1_1, h_5_1)); -#endif - __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); - __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - register __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); - w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); - w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); -#else - register __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); - w1 = _mm256_add_pd(w1, 
_mm256_mul_pd(a2_1, h_4_2)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); -#endif - __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - register __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); - z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); - register __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); -#else - register __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); - register __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); -#endif - register __m256d x1 = a1_1; - - __m256d q1; - - __m256d h1; - __m256d h2; - __m256d h3; - __m256d h4; - __m256d h5; - __m256d h6; - - for(i = 6; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-5]); - q1 = _mm256_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_pd(q1, h4, w1); -#else - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMA_pd(q1, h5, v1); -#else - v1 = _mm256_add_pd(v1, _mm256_mul_pd(q1,h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - t1 = _mm256_FMA_pd(q1, h6, t1); -#else - t1 = _mm256_add_pd(t1, _mm256_mul_pd(q1,h6)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-5]); - q1 = _mm256_load_pd(&q[nb*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); -#else - x1 = 
_mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_pd(q1, h4, w1); -#else - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMA_pd(q1, h5, v1); -#else - v1 = _mm256_add_pd(v1, _mm256_mul_pd(q1,h5)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-4]); - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_pd(q1, h4, w1); -#else - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); -#endif - - h1 = 
_mm256_broadcast_sd(&hh[nb-2]); - q1 = _mm256_load_pd(&q[(nb+3)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - q1 = _mm256_load_pd(&q[(nb+4)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); -#endif - - ///////////////////////////////////////////////////// - // Apply tau, correct wrong calculation using pre-calculated scalar products - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(&hh[0]); - x1 = _mm256_mul_pd(x1, tau1); - - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d vs_1_2 = _mm256_broadcast_sd(&scalarprods[0]); - h2 = _mm256_mul_pd(tau2, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMSUB_pd(y1, tau2, _mm256_mul_pd(x1,h2)); -#else - y1 = _mm256_sub_pd(_mm256_mul_pd(y1,tau2), _mm256_mul_pd(x1,h2)); -#endif - - __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); - __m256d vs_1_3 = _mm256_broadcast_sd(&scalarprods[1]); - __m256d vs_2_3 = _mm256_broadcast_sd(&scalarprods[2]); - h2 = _mm256_mul_pd(tau3, vs_1_3); - h3 = _mm256_mul_pd(tau3, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMSUB_pd(z1, tau3, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); -#else - z1 = _mm256_sub_pd(_mm256_mul_pd(z1,tau3), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); -#endif - - __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); - __m256d vs_1_4 = _mm256_broadcast_sd(&scalarprods[3]); - __m256d vs_2_4 = _mm256_broadcast_sd(&scalarprods[4]); - h2 = _mm256_mul_pd(tau4, vs_1_4); - h3 = _mm256_mul_pd(tau4, vs_2_4); - __m256d vs_3_4 = _mm256_broadcast_sd(&scalarprods[5]); - h4 = _mm256_mul_pd(tau4, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMSUB_pd(w1, tau4, 
_mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); -#else - w1 = _mm256_sub_pd(_mm256_mul_pd(w1,tau4), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); -#endif - - __m256d tau5 = _mm256_broadcast_sd(&hh[ldh*4]); - __m256d vs_1_5 = _mm256_broadcast_sd(&scalarprods[6]); - __m256d vs_2_5 = _mm256_broadcast_sd(&scalarprods[7]); - h2 = _mm256_mul_pd(tau5, vs_1_5); - h3 = _mm256_mul_pd(tau5, vs_2_5); - __m256d vs_3_5 = _mm256_broadcast_sd(&scalarprods[8]); - __m256d vs_4_5 = _mm256_broadcast_sd(&scalarprods[9]); - h4 = _mm256_mul_pd(tau5, vs_3_5); - h5 = _mm256_mul_pd(tau5, vs_4_5); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMSUB_pd(v1, tau5, _mm256_add_pd(_mm256_FMA_pd(w1, h5, _mm256_mul_pd(z1,h4)), _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); -#else - v1 = _mm256_sub_pd(_mm256_mul_pd(v1,tau5), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1,h5), _mm256_mul_pd(z1,h4)), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); -#endif - - __m256d tau6 = _mm256_broadcast_sd(&hh[ldh*5]); - __m256d vs_1_6 = _mm256_broadcast_sd(&scalarprods[10]); - __m256d vs_2_6 = _mm256_broadcast_sd(&scalarprods[11]); - h2 = _mm256_mul_pd(tau6, vs_1_6); - h3 = _mm256_mul_pd(tau6, vs_2_6); - __m256d vs_3_6 = _mm256_broadcast_sd(&scalarprods[12]); - __m256d vs_4_6 = _mm256_broadcast_sd(&scalarprods[13]); - __m256d vs_5_6 = _mm256_broadcast_sd(&scalarprods[14]); - h4 = _mm256_mul_pd(tau6, vs_3_6); - h5 = _mm256_mul_pd(tau6, vs_4_6); - h6 = _mm256_mul_pd(tau6, vs_5_6); -#ifdef __ELPA_USE_FMA__ - t1 = _mm256_FMSUB_pd(t1, tau6, _mm256_FMA_pd(v1, h6, _mm256_add_pd(_mm256_FMA_pd(w1, h5, _mm256_mul_pd(z1,h4)), _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))))); -#else - t1 = _mm256_sub_pd(_mm256_mul_pd(t1,tau6), _mm256_add_pd( _mm256_mul_pd(v1,h6), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1,h5), _mm256_mul_pd(z1,h4)), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))))); -#endif - - 
///////////////////////////////////////////////////// - // Rank-1 update of Q [4 x nb+3] - ///////////////////////////////////////////////////// - - q1 = _mm256_load_pd(&q[0]); - q1 = _mm256_sub_pd(q1, t1); - _mm256_store_pd(&q[0],q1); - - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+1]); - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_sub_pd(q1, v1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); -#endif - _mm256_store_pd(&q[ldq],q1); - - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+1]); - q1 = _mm256_load_pd(&q[ldq*2]); - q1 = _mm256_sub_pd(q1, w1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); -#endif - _mm256_store_pd(&q[ldq*2],q1); - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - q1 = _mm256_load_pd(&q[ldq*3]); - q1 = _mm256_sub_pd(q1, z1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); -#endif - _mm256_store_pd(&q[ldq*3],q1); - - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - q1 = _mm256_load_pd(&q[ldq*4]); - q1 = _mm256_sub_pd(q1, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+3]); -#ifdef 
__ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); -#endif - _mm256_store_pd(&q[ldq*4],q1); - - h2 = _mm256_broadcast_sd(&hh[(ldh)+1]); - q1 = _mm256_load_pd(&q[ldq*5]); - q1 = _mm256_sub_pd(q1, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); -#endif - _mm256_store_pd(&q[ldq*5],q1); - - for (i = 6; i < nb; i++) - { - q1 = _mm256_load_pd(&q[i*ldq]); - h1 = _mm256_broadcast_sd(&hh[i-5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); -#endif - h5 = 
_mm256_broadcast_sd(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); -#endif - _mm256_store_pd(&q[i*ldq],q1); - } - - h1 = _mm256_broadcast_sd(&hh[nb-5]); - q1 = _mm256_load_pd(&q[nb*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); -#endif - _mm256_store_pd(&q[nb*ldq],q1); - - h1 = _mm256_broadcast_sd(&hh[nb-4]); - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); -#endif - 
_mm256_store_pd(&q[(nb+1)*ldq],q1); - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); -#endif - _mm256_store_pd(&q[(nb+2)*ldq],q1); - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - q1 = _mm256_load_pd(&q[(nb+3)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); -#endif - _mm256_store_pd(&q[(nb+3)*ldq],q1); - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - q1 = _mm256_load_pd(&q[(nb+4)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); -#endif - _mm256_store_pd(&q[(nb+4)*ldq],q1); -} -#else -/** - * Unrolled kernel that computes - * 4 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [4 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m128d a1_1 = _mm_load_pd(&q[ldq*5]); - __m128d a2_1 = _mm_load_pd(&q[ldq*4]); - __m128d a3_1 = _mm_load_pd(&q[ldq*3]); - __m128d a4_1 = _mm_load_pd(&q[ldq*2]); - __m128d a5_1 = _mm_load_pd(&q[ldq]); - __m128d a6_1 
= _mm_load_pd(&q[0]); - - __m128d h_6_5 = _mm_loaddup_pd(&hh[(ldh*5)+1]); - __m128d h_6_4 = _mm_loaddup_pd(&hh[(ldh*5)+2]); - __m128d h_6_3 = _mm_loaddup_pd(&hh[(ldh*5)+3]); - __m128d h_6_2 = _mm_loaddup_pd(&hh[(ldh*5)+4]); - __m128d h_6_1 = _mm_loaddup_pd(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - register __m128d t1 = _mm_macc_pd(a5_1, h_6_5, a6_1); - t1 = _mm_macc_pd(a4_1, h_6_4, t1); - t1 = _mm_macc_pd(a3_1, h_6_3, t1); - t1 = _mm_macc_pd(a2_1, h_6_2, t1); - t1 = _mm_macc_pd(a1_1, h_6_1, t1); -#else - register __m128d t1 = _mm_add_pd(a6_1, _mm_mul_pd(a5_1, h_6_5)); - t1 = _mm_add_pd(t1, _mm_mul_pd(a4_1, h_6_4)); - t1 = _mm_add_pd(t1, _mm_mul_pd(a3_1, h_6_3)); - t1 = _mm_add_pd(t1, _mm_mul_pd(a2_1, h_6_2)); - t1 = _mm_add_pd(t1, _mm_mul_pd(a1_1, h_6_1)); -#endif - __m128d h_5_4 = _mm_loaddup_pd(&hh[(ldh*4)+1]); - __m128d h_5_3 = _mm_loaddup_pd(&hh[(ldh*4)+2]); - __m128d h_5_2 = _mm_loaddup_pd(&hh[(ldh*4)+3]); - __m128d h_5_1 = _mm_loaddup_pd(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - register __m128d v1 = _mm_macc_pd(a4_1, h_5_4, a5_1); - v1 = _mm_macc_pd(a3_1, h_5_3, v1); - v1 = _mm_macc_pd(a2_1, h_5_2, v1); - v1 = _mm_macc_pd(a1_1, h_5_1, v1); -#else - register __m128d v1 = _mm_add_pd(a5_1, _mm_mul_pd(a4_1, h_5_4)); - v1 = _mm_add_pd(v1, _mm_mul_pd(a3_1, h_5_3)); - v1 = _mm_add_pd(v1, _mm_mul_pd(a2_1, h_5_2)); - v1 = _mm_add_pd(v1, _mm_mul_pd(a1_1, h_5_1)); -#endif - __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); - __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - register __m128d w1 = _mm_macc_pd(a3_1, h_4_3, a4_1); - w1 = _mm_macc_pd(a2_1, h_4_2, w1); - w1 = _mm_macc_pd(a1_1, h_4_1, w1); -#else - register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); -#endif - __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); - __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - __m128d h_3_1 = 
_mm_loaddup_pd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - register __m128d z1 = _mm_macc_pd(a2_1, h_3_2, a3_1); - z1 = _mm_macc_pd(a1_1, h_3_1, z1); - register __m128d y1 = _mm_macc_pd(a1_1, h_2_1, a2_1); -#else - register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); - register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); -#endif - register __m128d x1 = a1_1; - - __m128d a1_2 = _mm_load_pd(&q[(ldq*5)+2]); - __m128d a2_2 = _mm_load_pd(&q[(ldq*4)+2]); - __m128d a3_2 = _mm_load_pd(&q[(ldq*3)+2]); - __m128d a4_2 = _mm_load_pd(&q[(ldq*2)+2]); - __m128d a5_2 = _mm_load_pd(&q[(ldq)+2]); - __m128d a6_2 = _mm_load_pd(&q[2]); - -#ifdef __ELPA_USE_FMA__ - register __m128d t2 = _mm_macc_pd(a5_2, h_6_5, a6_2); - t2 = _mm_macc_pd(a4_2, h_6_4, t2); - t2 = _mm_macc_pd(a3_2, h_6_3, t2); - t2 = _mm_macc_pd(a2_2, h_6_2, t2); - t2 = _mm_macc_pd(a1_2, h_6_1, t2); - register __m128d v2 = _mm_macc_pd(a4_2, h_5_4, a5_2); - v2 = _mm_macc_pd(a3_2, h_5_3, v2); - v2 = _mm_macc_pd(a2_2, h_5_2, v2); - v2 = _mm_macc_pd(a1_2, h_5_1, v2); - register __m128d w2 = _mm_macc_pd(a3_2, h_4_3, a4_2); - w2 = _mm_macc_pd(a2_2, h_4_2, w2); - w2 = _mm_macc_pd(a1_2, h_4_1, w2); - register __m128d z2 = _mm_macc_pd(a2_2, h_3_2, a3_2); - z2 = _mm_macc_pd(a1_2, h_3_1, z2); - register __m128d y2 = _mm_macc_pd(a1_2, h_2_1, a2_2); -#else - register __m128d t2 = _mm_add_pd(a6_2, _mm_mul_pd(a5_2, h_6_5)); - t2 = _mm_add_pd(t2, _mm_mul_pd(a4_2, h_6_4)); - t2 = _mm_add_pd(t2, _mm_mul_pd(a3_2, h_6_3)); - t2 = _mm_add_pd(t2, _mm_mul_pd(a2_2, h_6_2)); - t2 = _mm_add_pd(t2, _mm_mul_pd(a1_2, h_6_1)); - register __m128d v2 = _mm_add_pd(a5_2, _mm_mul_pd(a4_2, h_5_4)); - v2 = _mm_add_pd(v2, _mm_mul_pd(a3_2, h_5_3)); - v2 = _mm_add_pd(v2, _mm_mul_pd(a2_2, h_5_2)); - v2 = _mm_add_pd(v2, _mm_mul_pd(a1_2, h_5_1)); - register __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); - w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); - w2 = _mm_add_pd(w2, 
_mm_mul_pd(a1_2, h_4_1)); - register __m128d z2 = _mm_add_pd(a3_2, _mm_mul_pd(a2_2, h_3_2)); - z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); - register __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); -#endif - register __m128d x2 = a1_2; - - __m128d q1; - __m128d q2; - - __m128d h1; - __m128d h2; - __m128d h3; - __m128d h4; - __m128d h5; - __m128d h6; - - for(i = 6; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-5]); - q1 = _mm_load_pd(&q[i*ldq]); - q2 = _mm_load_pd(&q[(i*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); -#else - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); - z2 = _mm_macc_pd(q2, h3, z2); -#else - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); -#endif - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_macc_pd(q1, h4, w1); - w2 = _mm_macc_pd(q2, h4, w2); -#else - w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); - w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); -#endif - h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm_macc_pd(q1, h5, v1); - v2 = _mm_macc_pd(q2, h5, v2); -#else - v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); - v2 = _mm_add_pd(v2, _mm_mul_pd(q2,h5)); -#endif - h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - t1 = _mm_macc_pd(q1, h6, t1); - t2 = _mm_macc_pd(q2, h6, t2); -#else - t1 = _mm_add_pd(t1, _mm_mul_pd(q1,h6)); - t2 = _mm_add_pd(t2, _mm_mul_pd(q2,h6)); -#endif - } - - h1 = _mm_loaddup_pd(&hh[nb-5]); - q1 = _mm_load_pd(&q[nb*ldq]); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = 
_mm_macc_pd(q2, h1, x2); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); -#else - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); - z2 = _mm_macc_pd(q2, h3, z2); -#else - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); -#endif - h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_macc_pd(q1, h4, w1); - w2 = _mm_macc_pd(q2, h4, w2); -#else - w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); - w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); -#endif - h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm_macc_pd(q1, h5, v1); - v2 = _mm_macc_pd(q2, h5, v2); -#else - v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); - v2 = _mm_add_pd(v2, _mm_mul_pd(q2,h5)); -#endif - - h1 = _mm_loaddup_pd(&hh[nb-4]); - q1 = _mm_load_pd(&q[(nb+1)*ldq]); - q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); -#else - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); - z2 = _mm_macc_pd(q2, h3, z2); -#else - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); -#endif - h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_macc_pd(q1, h4, w1); - w2 = _mm_macc_pd(q2, h4, w2); -#else - w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); - w2 = _mm_add_pd(w2, 
_mm_mul_pd(q2,h4)); -#endif - - h1 = _mm_loaddup_pd(&hh[nb-3]); - q1 = _mm_load_pd(&q[(nb+2)*ldq]); - q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); -#else - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); - z2 = _mm_macc_pd(q2, h3, z2); -#else - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); -#endif - - h1 = _mm_loaddup_pd(&hh[nb-2]); - q1 = _mm_load_pd(&q[(nb+3)*ldq]); - q2 = _mm_load_pd(&q[((nb+3)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); - y2 = _mm_macc_pd(q2, h2, y2); -#else - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); -#endif - - h1 = _mm_loaddup_pd(&hh[nb-1]); - q1 = _mm_load_pd(&q[(nb+4)*ldq]); - q2 = _mm_load_pd(&q[((nb+4)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); - x2 = _mm_macc_pd(q2, h1, x2); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); -#endif - - ///////////////////////////////////////////////////// - // Apply tau, correct wrong calculation using pre-calculated scalar products - ///////////////////////////////////////////////////// - - __m128d tau1 = _mm_loaddup_pd(&hh[0]); - x1 = _mm_mul_pd(x1, tau1); - x2 = _mm_mul_pd(x2, tau1); - - __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); - __m128d vs_1_2 = 
_mm_loaddup_pd(&scalarprods[0]); - h2 = _mm_mul_pd(tau2, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_msub_pd(y1, tau2, _mm_mul_pd(x1,h2)); - y2 = _mm_msub_pd(y2, tau2, _mm_mul_pd(x2,h2)); -#else - y1 = _mm_sub_pd(_mm_mul_pd(y1,tau2), _mm_mul_pd(x1,h2)); - y2 = _mm_sub_pd(_mm_mul_pd(y2,tau2), _mm_mul_pd(x2,h2)); -#endif - - __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); - __m128d vs_1_3 = _mm_loaddup_pd(&scalarprods[1]); - __m128d vs_2_3 = _mm_loaddup_pd(&scalarprods[2]); - h2 = _mm_mul_pd(tau3, vs_1_3); - h3 = _mm_mul_pd(tau3, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_msub_pd(z1, tau3, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))); - z2 = _mm_msub_pd(z2, tau3, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2))); -#else - z1 = _mm_sub_pd(_mm_mul_pd(z1,tau3), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); - z2 = _mm_sub_pd(_mm_mul_pd(z2,tau3), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); -#endif - - __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); - __m128d vs_1_4 = _mm_loaddup_pd(&scalarprods[3]); - __m128d vs_2_4 = _mm_loaddup_pd(&scalarprods[4]); - h2 = _mm_mul_pd(tau4, vs_1_4); - h3 = _mm_mul_pd(tau4, vs_2_4); - __m128d vs_3_4 = _mm_loaddup_pd(&scalarprods[5]); - h4 = _mm_mul_pd(tau4, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_msub_pd(w1, tau4, _mm_macc_pd(z1, h4, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); - w2 = _mm_msub_pd(w2, tau4, _mm_macc_pd(z2, h4, _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2)))); -#else - w1 = _mm_sub_pd(_mm_mul_pd(w1,tau4), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); - w2 = _mm_sub_pd(_mm_mul_pd(w2,tau4), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); -#endif - - __m128d tau5 = _mm_loaddup_pd(&hh[ldh*4]); - __m128d vs_1_5 = _mm_loaddup_pd(&scalarprods[6]); - __m128d vs_2_5 = _mm_loaddup_pd(&scalarprods[7]); - h2 = _mm_mul_pd(tau5, vs_1_5); - h3 = _mm_mul_pd(tau5, vs_2_5); - __m128d vs_3_5 = _mm_loaddup_pd(&scalarprods[8]); - __m128d vs_4_5 = _mm_loaddup_pd(&scalarprods[9]); - h4 = 
_mm_mul_pd(tau5, vs_3_5); - h5 = _mm_mul_pd(tau5, vs_4_5); -#ifdef __ELPA_USE_FMA__ - v1 = _mm_msub_pd(v1, tau5, _mm_add_pd(_mm_macc_pd(w1, h5, _mm_mul_pd(z1,h4)), _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); - v2 = _mm_msub_pd(v2, tau5, _mm_add_pd(_mm_macc_pd(w2, h5, _mm_mul_pd(z2,h4)), _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2)))); -#else - v1 = _mm_sub_pd(_mm_mul_pd(v1,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); - v2 = _mm_sub_pd(_mm_mul_pd(v2,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2,h5), _mm_mul_pd(z2,h4)), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); -#endif - - __m128d tau6 = _mm_loaddup_pd(&hh[ldh*5]); - __m128d vs_1_6 = _mm_loaddup_pd(&scalarprods[10]); - __m128d vs_2_6 = _mm_loaddup_pd(&scalarprods[11]); - h2 = _mm_mul_pd(tau6, vs_1_6); - h3 = _mm_mul_pd(tau6, vs_2_6); - __m128d vs_3_6 = _mm_loaddup_pd(&scalarprods[12]); - __m128d vs_4_6 = _mm_loaddup_pd(&scalarprods[13]); - __m128d vs_5_6 = _mm_loaddup_pd(&scalarprods[14]); - h4 = _mm_mul_pd(tau6, vs_3_6); - h5 = _mm_mul_pd(tau6, vs_4_6); - h6 = _mm_mul_pd(tau6, vs_5_6); -#ifdef __ELPA_USE_FMA__ - t1 = _mm_msub_pd(t1, tau6, _mm_macc_pd(v1, h6, _mm_add_pd(_mm_macc_pd(w1, h5, _mm_mul_pd(z1,h4)), _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))))); - t2 = _mm_msub_pd(t2, tau6, _mm_macc_pd(v2, h6, _mm_add_pd(_mm_macc_pd(w2, h5, _mm_mul_pd(z2,h4)), _mm_macc_pd(y2, h3, _mm_mul_pd(x2,h2))))); -#else - t1 = _mm_sub_pd(_mm_mul_pd(t1,tau6), _mm_add_pd( _mm_mul_pd(v1,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))))); - t2 = _mm_sub_pd(_mm_mul_pd(t2,tau6), _mm_add_pd( _mm_mul_pd(v2,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2,h5), _mm_mul_pd(z2,h4)), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))))); -#endif - - ///////////////////////////////////////////////////// - // Rank-1 update of Q [4 x nb+3] - ///////////////////////////////////////////////////// - - q1 = _mm_load_pd(&q[0]); - 
q2 = _mm_load_pd(&q[2]); - q1 = _mm_sub_pd(q1, t1); - q2 = _mm_sub_pd(q2, t2); - _mm_store_pd(&q[0],q1); - _mm_store_pd(&q[2],q2); - - h6 = _mm_loaddup_pd(&hh[(ldh*5)+1]); - q1 = _mm_load_pd(&q[ldq]); - q2 = _mm_load_pd(&q[(ldq+2)]); - q1 = _mm_sub_pd(q1, v1); - q2 = _mm_sub_pd(q2, v2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); - q2 = _mm_nmacc_pd(t2, h6, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); -#endif - _mm_store_pd(&q[ldq],q1); - _mm_store_pd(&q[(ldq+2)],q2); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+1]); - q1 = _mm_load_pd(&q[ldq*2]); - q2 = _mm_load_pd(&q[(ldq*2)+2]); - q1 = _mm_sub_pd(q1, w1); - q2 = _mm_sub_pd(q2, w2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); - q2 = _mm_nmacc_pd(v2, h5, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); -#endif - h6 = _mm_loaddup_pd(&hh[(ldh*5)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); - q2 = _mm_nmacc_pd(t2, h6, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); -#endif - _mm_store_pd(&q[ldq*2],q1); - _mm_store_pd(&q[(ldq*2)+2],q2); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - q1 = _mm_load_pd(&q[ldq*3]); - q2 = _mm_load_pd(&q[(ldq*3)+2]); - q1 = _mm_sub_pd(q1, z1); - q2 = _mm_sub_pd(q2, z2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); -#endif - h5 = _mm_loaddup_pd(&hh[(ldh*4)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); - q2 = _mm_nmacc_pd(v2, h5, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); -#endif - h6 = _mm_loaddup_pd(&hh[(ldh*5)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); - q2 = _mm_nmacc_pd(t2, h6, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); 
-#endif - _mm_store_pd(&q[ldq*3],q1); - _mm_store_pd(&q[(ldq*3)+2],q2); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - q1 = _mm_load_pd(&q[ldq*4]); - q2 = _mm_load_pd(&q[(ldq*4)+2]); - q1 = _mm_sub_pd(q1, y1); - q2 = _mm_sub_pd(q2, y2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); -#endif - h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); -#endif - h5 = _mm_loaddup_pd(&hh[(ldh*4)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); - q2 = _mm_nmacc_pd(v2, h5, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); -#endif - h6 = _mm_loaddup_pd(&hh[(ldh*5)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); - q2 = _mm_nmacc_pd(t2, h6, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); -#endif - _mm_store_pd(&q[ldq*4],q1); - _mm_store_pd(&q[(ldq*4)+2],q2); - - h2 = _mm_loaddup_pd(&hh[(ldh)+1]); - q1 = _mm_load_pd(&q[ldq*5]); - q2 = _mm_load_pd(&q[(ldq*5)+2]); - q1 = _mm_sub_pd(q1, x1); - q2 = _mm_sub_pd(q2, x2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); -#endif - h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); -#endif - h5 
= _mm_loaddup_pd(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); - q2 = _mm_nmacc_pd(v2, h5, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); -#endif - h6 = _mm_loaddup_pd(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); - q2 = _mm_nmacc_pd(t2, h6, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); -#endif - _mm_store_pd(&q[ldq*5],q1); - _mm_store_pd(&q[(ldq*5)+2],q2); - - for (i = 6; i < nb; i++) - { - q1 = _mm_load_pd(&q[i*ldq]); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - h1 = _mm_loaddup_pd(&hh[i-5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); -#endif - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); -#endif - h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); - q2 = _mm_nmacc_pd(v2, h5, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); -#endif - h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); - q2 = _mm_nmacc_pd(t2, h6, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); 
-#endif - _mm_store_pd(&q[i*ldq],q1); - _mm_store_pd(&q[(i*ldq)+2],q2); - } - - h1 = _mm_loaddup_pd(&hh[nb-5]); - q1 = _mm_load_pd(&q[nb*ldq]); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); -#endif - h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); -#endif - h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); - q2 = _mm_nmacc_pd(v2, h5, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); -#endif - _mm_store_pd(&q[nb*ldq],q1); - _mm_store_pd(&q[(nb*ldq)+2],q2); - - h1 = _mm_loaddup_pd(&hh[nb-4]); - q1 = _mm_load_pd(&q[(nb+1)*ldq]); - q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, 
q1); - q2 = _mm_nmacc_pd(z2, h3, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); -#endif - h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); - q2 = _mm_nmacc_pd(w2, h4, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); -#endif - _mm_store_pd(&q[(nb+1)*ldq],q1); - _mm_store_pd(&q[((nb+1)*ldq)+2],q2); - - h1 = _mm_loaddup_pd(&hh[nb-3]); - q1 = _mm_load_pd(&q[(nb+2)*ldq]); - q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); - q2 = _mm_nmacc_pd(z2, h3, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); -#endif - _mm_store_pd(&q[(nb+2)*ldq],q1); - _mm_store_pd(&q[((nb+2)*ldq)+2],q2); - - h1 = _mm_loaddup_pd(&hh[nb-2]); - q1 = _mm_load_pd(&q[(nb+3)*ldq]); - q2 = _mm_load_pd(&q[((nb+3)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); - q2 = _mm_nmacc_pd(y2, h2, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); -#endif - _mm_store_pd(&q[(nb+3)*ldq],q1); - _mm_store_pd(&q[((nb+3)*ldq)+2],q2); - - h1 = _mm_loaddup_pd(&hh[nb-1]); - q1 = _mm_load_pd(&q[(nb+4)*ldq]); - q2 = 
_mm_load_pd(&q[((nb+4)*ldq)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); - q2 = _mm_nmacc_pd(x2, h1, q2); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); -#endif - _mm_store_pd(&q[(nb+4)*ldq],q1); - _mm_store_pd(&q[((nb+4)*ldq)+2],q2); -} - -/** - * Unrolled kernel that computes - * 2 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [2 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m128d a1_1 = _mm_load_pd(&q[ldq*5]); - __m128d a2_1 = _mm_load_pd(&q[ldq*4]); - __m128d a3_1 = _mm_load_pd(&q[ldq*3]); - __m128d a4_1 = _mm_load_pd(&q[ldq*2]); - __m128d a5_1 = _mm_load_pd(&q[ldq]); - __m128d a6_1 = _mm_load_pd(&q[0]); - - __m128d h_6_5 = _mm_loaddup_pd(&hh[(ldh*5)+1]); - __m128d h_6_4 = _mm_loaddup_pd(&hh[(ldh*5)+2]); - __m128d h_6_3 = _mm_loaddup_pd(&hh[(ldh*5)+3]); - __m128d h_6_2 = _mm_loaddup_pd(&hh[(ldh*5)+4]); - __m128d h_6_1 = _mm_loaddup_pd(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - register __m128d t1 = _mm_macc_pd(a5_1, h_6_5, a6_1); - t1 = _mm_macc_pd(a4_1, h_6_4, t1); - t1 = _mm_macc_pd(a3_1, h_6_3, t1); - t1 = _mm_macc_pd(a2_1, h_6_2, t1); - t1 = _mm_macc_pd(a1_1, h_6_1, t1); -#else - register __m128d t1 = _mm_add_pd(a6_1, _mm_mul_pd(a5_1, h_6_5)); - t1 = _mm_add_pd(t1, _mm_mul_pd(a4_1, h_6_4)); - t1 = _mm_add_pd(t1, _mm_mul_pd(a3_1, h_6_3)); - t1 = _mm_add_pd(t1, _mm_mul_pd(a2_1, h_6_2)); - t1 = _mm_add_pd(t1, _mm_mul_pd(a1_1, h_6_1)); -#endif - __m128d h_5_4 = _mm_loaddup_pd(&hh[(ldh*4)+1]); - __m128d h_5_3 = _mm_loaddup_pd(&hh[(ldh*4)+2]); - __m128d h_5_2 = _mm_loaddup_pd(&hh[(ldh*4)+3]); - __m128d h_5_1 = 
_mm_loaddup_pd(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - register __m128d v1 = _mm_macc_pd(a4_1, h_5_4, a5_1); - v1 = _mm_macc_pd(a3_1, h_5_3, v1); - v1 = _mm_macc_pd(a2_1, h_5_2, v1); - v1 = _mm_macc_pd(a1_1, h_5_1, v1); -#else - register __m128d v1 = _mm_add_pd(a5_1, _mm_mul_pd(a4_1, h_5_4)); - v1 = _mm_add_pd(v1, _mm_mul_pd(a3_1, h_5_3)); - v1 = _mm_add_pd(v1, _mm_mul_pd(a2_1, h_5_2)); - v1 = _mm_add_pd(v1, _mm_mul_pd(a1_1, h_5_1)); -#endif - __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); - __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - register __m128d w1 = _mm_macc_pd(a3_1, h_4_3, a4_1); - w1 = _mm_macc_pd(a2_1, h_4_2, w1); - w1 = _mm_macc_pd(a1_1, h_4_1, w1); -#else - register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); -#endif - __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); - __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - register __m128d z1 = _mm_macc_pd(a2_1, h_3_2, a3_1); - z1 = _mm_macc_pd(a1_1, h_3_1, z1); - register __m128d y1 = _mm_macc_pd(a1_1, h_2_1, a2_1); -#else - register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); - register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); -#endif - register __m128d x1 = a1_1; - - __m128d q1; - - __m128d h1; - __m128d h2; - __m128d h3; - __m128d h4; - __m128d h5; - __m128d h6; - - for(i = 6; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-5]); - q1 = _mm_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); -#else - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); -#ifdef 
__ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); -#else - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); -#endif - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_macc_pd(q1, h4, w1); -#else - w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); -#endif - h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm_macc_pd(q1, h5, v1); -#else - v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); -#endif - h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - t1 = _mm_macc_pd(q1, h6, t1); -#else - t1 = _mm_add_pd(t1, _mm_mul_pd(q1,h6)); -#endif - } - - h1 = _mm_loaddup_pd(&hh[nb-5]); - q1 = _mm_load_pd(&q[nb*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); -#else - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); -#else - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); -#endif - h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_macc_pd(q1, h4, w1); -#else - w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); -#endif - h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm_macc_pd(q1, h5, v1); -#else - v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); -#endif - - h1 = _mm_loaddup_pd(&hh[nb-4]); - q1 = _mm_load_pd(&q[(nb+1)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); -#else - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); -#else - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); -#endif - h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_macc_pd(q1, h4, w1); -#else - w1 = _mm_add_pd(w1, 
_mm_mul_pd(q1,h4)); -#endif - - h1 = _mm_loaddup_pd(&hh[nb-3]); - q1 = _mm_load_pd(&q[(nb+2)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); -#else - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_macc_pd(q1, h3, z1); -#else - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); -#endif - - h1 = _mm_loaddup_pd(&hh[nb-2]); - q1 = _mm_load_pd(&q[(nb+3)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_macc_pd(q1, h2, y1); -#else - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); -#endif - - h1 = _mm_loaddup_pd(&hh[nb-1]); - q1 = _mm_load_pd(&q[(nb+4)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_macc_pd(q1, h1, x1); -#else - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); -#endif - - ///////////////////////////////////////////////////// - // Apply tau, correct wrong calculation using pre-calculated scalar products - ///////////////////////////////////////////////////// - - __m128d tau1 = _mm_loaddup_pd(&hh[0]); - x1 = _mm_mul_pd(x1, tau1); - - __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); - __m128d vs_1_2 = _mm_loaddup_pd(&scalarprods[0]); - h2 = _mm_mul_pd(tau2, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_msub_pd(y1, tau2, _mm_mul_pd(x1,h2)); -#else - y1 = _mm_sub_pd(_mm_mul_pd(y1,tau2), _mm_mul_pd(x1,h2)); -#endif - - __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); - __m128d vs_1_3 = _mm_loaddup_pd(&scalarprods[1]); - __m128d vs_2_3 = _mm_loaddup_pd(&scalarprods[2]); - h2 = _mm_mul_pd(tau3, vs_1_3); - h3 = _mm_mul_pd(tau3, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm_msub_pd(z1, tau3, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))); -#else - z1 = _mm_sub_pd(_mm_mul_pd(z1,tau3), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); 
-#endif - - __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); - __m128d vs_1_4 = _mm_loaddup_pd(&scalarprods[3]); - __m128d vs_2_4 = _mm_loaddup_pd(&scalarprods[4]); - h2 = _mm_mul_pd(tau4, vs_1_4); - h3 = _mm_mul_pd(tau4, vs_2_4); - __m128d vs_3_4 = _mm_loaddup_pd(&scalarprods[5]); - h4 = _mm_mul_pd(tau4, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm_msub_pd(w1, tau4, _mm_macc_pd(z1, h4, _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); -#else - w1 = _mm_sub_pd(_mm_mul_pd(w1,tau4), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); -#endif - - __m128d tau5 = _mm_loaddup_pd(&hh[ldh*4]); - __m128d vs_1_5 = _mm_loaddup_pd(&scalarprods[6]); - __m128d vs_2_5 = _mm_loaddup_pd(&scalarprods[7]); - h2 = _mm_mul_pd(tau5, vs_1_5); - h3 = _mm_mul_pd(tau5, vs_2_5); - __m128d vs_3_5 = _mm_loaddup_pd(&scalarprods[8]); - __m128d vs_4_5 = _mm_loaddup_pd(&scalarprods[9]); - h4 = _mm_mul_pd(tau5, vs_3_5); - h5 = _mm_mul_pd(tau5, vs_4_5); -#ifdef __ELPA_USE_FMA__ - v1 = _mm_msub_pd(v1, tau5, _mm_add_pd(_mm_macc_pd(w1, h5, _mm_mul_pd(z1,h4)), _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2)))); -#else - v1 = _mm_sub_pd(_mm_mul_pd(v1,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); -#endif - - __m128d tau6 = _mm_loaddup_pd(&hh[ldh*5]); - __m128d vs_1_6 = _mm_loaddup_pd(&scalarprods[10]); - __m128d vs_2_6 = _mm_loaddup_pd(&scalarprods[11]); - h2 = _mm_mul_pd(tau6, vs_1_6); - h3 = _mm_mul_pd(tau6, vs_2_6); - __m128d vs_3_6 = _mm_loaddup_pd(&scalarprods[12]); - __m128d vs_4_6 = _mm_loaddup_pd(&scalarprods[13]); - __m128d vs_5_6 = _mm_loaddup_pd(&scalarprods[14]); - h4 = _mm_mul_pd(tau6, vs_3_6); - h5 = _mm_mul_pd(tau6, vs_4_6); - h6 = _mm_mul_pd(tau6, vs_5_6); -#ifdef __ELPA_USE_FMA__ - t1 = _mm_msub_pd(t1, tau6, _mm_macc_pd(v1, h6, _mm_add_pd(_mm_macc_pd(w1, h5, _mm_mul_pd(z1,h4)), _mm_macc_pd(y1, h3, _mm_mul_pd(x1,h2))))); -#else - t1 = _mm_sub_pd(_mm_mul_pd(t1,tau6), _mm_add_pd( _mm_mul_pd(v1,h6), 
_mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))))); -#endif - - ///////////////////////////////////////////////////// - // Rank-1 update of Q [2 x nb+3] - ///////////////////////////////////////////////////// - - q1 = _mm_load_pd(&q[0]); - q1 = _mm_sub_pd(q1, t1); - _mm_store_pd(&q[0],q1); - - h6 = _mm_loaddup_pd(&hh[(ldh*5)+1]); - q1 = _mm_load_pd(&q[ldq]); - q1 = _mm_sub_pd(q1, v1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); -#endif - _mm_store_pd(&q[ldq],q1); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+1]); - q1 = _mm_load_pd(&q[ldq*2]); - q1 = _mm_sub_pd(q1, w1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); -#endif - h6 = _mm_loaddup_pd(&hh[(ldh*5)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); -#endif - _mm_store_pd(&q[ldq*2],q1); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - q1 = _mm_load_pd(&q[ldq*3]); - q1 = _mm_sub_pd(q1, z1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); -#endif - h5 = _mm_loaddup_pd(&hh[(ldh*4)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); -#endif - h6 = _mm_loaddup_pd(&hh[(ldh*5)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); -#endif - _mm_store_pd(&q[ldq*3],q1); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - q1 = _mm_load_pd(&q[ldq*4]); - q1 = _mm_sub_pd(q1, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); -#endif - h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); -#endif - h5 = _mm_loaddup_pd(&hh[(ldh*4)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = 
_mm_nmacc_pd(v1, h5, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); -#endif - h6 = _mm_loaddup_pd(&hh[(ldh*5)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); -#endif - _mm_store_pd(&q[ldq*4],q1); - - h2 = _mm_loaddup_pd(&hh[(ldh)+1]); - q1 = _mm_load_pd(&q[ldq*5]); - q1 = _mm_sub_pd(q1, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); -#endif - h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); -#endif - h5 = _mm_loaddup_pd(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); -#endif - h6 = _mm_loaddup_pd(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); -#endif - _mm_store_pd(&q[ldq*5],q1); - - for (i = 6; i < nb; i++) - { - q1 = _mm_load_pd(&q[i*ldq]); - h1 = _mm_loaddup_pd(&hh[i-5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); -#endif - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); -#endif - h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); -#endif - h6 = 
_mm_loaddup_pd(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(t1, h6, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); -#endif - _mm_store_pd(&q[i*ldq],q1); - } - - h1 = _mm_loaddup_pd(&hh[nb-5]); - q1 = _mm_load_pd(&q[nb*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); -#endif - h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); -#endif - h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(v1, h5, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); -#endif - _mm_store_pd(&q[nb*ldq],q1); - - h1 = _mm_loaddup_pd(&hh[nb-4]); - q1 = _mm_load_pd(&q[(nb+1)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); -#endif - h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(w1, h4, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); -#endif - _mm_store_pd(&q[(nb+1)*ldq],q1); - - h1 = _mm_loaddup_pd(&hh[nb-3]); - q1 = _mm_load_pd(&q[(nb+2)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, 
q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); -#endif - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(z1, h3, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); -#endif - _mm_store_pd(&q[(nb+2)*ldq],q1); - - h1 = _mm_loaddup_pd(&hh[nb-2]); - q1 = _mm_load_pd(&q[(nb+3)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); -#endif - h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(y1, h2, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); -#endif - _mm_store_pd(&q[(nb+3)*ldq],q1); - - h1 = _mm_loaddup_pd(&hh[nb-1]); - q1 = _mm_load_pd(&q[(nb+4)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_nmacc_pd(x1, h1, q1); -#else - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); -#endif - _mm_store_pd(&q[(nb+4)*ldq],q1); -} -#endif diff --git a/src/elpa2_kernels/mod_single_hh_trafo_real.F90 b/src/elpa2_kernels/mod_single_hh_trafo_real.F90 deleted file mode 100644 index d48def112..000000000 --- a/src/elpa2_kernels/mod_single_hh_trafo_real.F90 +++ /dev/null @@ -1,67 +0,0 @@ -module single_hh_trafo_real - implicit none -#include "config-f90.h" - -#ifdef WITH_OPENMP - public single_hh_trafo_real_cpu_openmp -#else - public single_hh_trafo_real_cpu -#endif - contains - -#ifdef WITH_OPENMP - subroutine single_hh_trafo_real_cpu_openmp(q, hh, nb, nq, ldq) -#else - subroutine single_hh_trafo_real_cpu(q, hh, nb, nq, ldq) -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - ! Perform single real Householder transformation. - ! This routine is not performance critical and thus it is coded here in Fortran - - implicit none - integer(kind=ik), intent(in) :: nb, nq, ldq -! real(kind=rk), intent(inout) :: q(ldq, *) -! 
real(kind=rk), intent(in) :: hh(*) - real(kind=rk), intent(inout) :: q(1:ldq, 1:nb) - real(kind=rk), intent(in) :: hh(1:nb) - integer(kind=ik) :: i - real(kind=rk) :: v(nq) - -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%start("single_hh_trafo_real_cpu_openmp") -#else - call timer%start("single_hh_trafo_real_cpu") -#endif -#endif - - ! v = q * hh - v(:) = q(1:nq,1) - do i=2,nb - v(:) = v(:) + q(1:nq,i) * hh(i) - enddo - - ! v = v * tau - v(:) = v(:) * hh(1) - - ! q = q - v * hh**T - q(1:nq,1) = q(1:nq,1) - v(:) - do i=2,nb - q(1:nq,i) = q(1:nq,i) - v(:) * hh(i) - enddo - -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%stop("single_hh_trafo_real_cpu_openmp") -#else - call timer%stop("single_hh_trafo_real_cpu") -#endif -#endif - end subroutine - - -end module diff --git a/src/elpa2_utilities.F90 b/src/elpa2_utilities.F90 deleted file mode 100644 index fedc513f1..000000000 --- a/src/elpa2_utilities.F90 +++ /dev/null @@ -1,525 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! 
ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - - - -! ELPA2 -- 2-stage solver for ELPA -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - - -#include "config-f90.h" -#include <elpa/elpa_kernel_constants.h> - -module ELPA2_utilities - use ELPA_utilities - implicit none - - PRIVATE ! By default, all routines contained are private - - ! 
The following routines are public: - - public :: get_actual_real_kernel_name, get_actual_complex_kernel_name - public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, & - REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ, & - REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_AVX_BLOCK2, & - REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6 - - public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, & - COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, & - COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_AVX_BLOCK1, & - COMPLEX_ELPA_KERNEL_AVX_BLOCK2 - - public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES - - public :: get_actual_complex_kernel, get_actual_real_kernel - - public :: check_allowed_complex_kernels, check_allowed_real_kernels - - public :: AVAILABLE_COMPLEX_ELPA_KERNELS, AVAILABLE_REAL_ELPA_KERNELS - - public :: print_available_real_kernels, print_available_complex_kernels - - public :: qr_decomposition_via_environment_variable - - integer, parameter :: number_of_real_kernels = ELPA2_NUMBER_OF_REAL_KERNELS - integer, parameter :: REAL_ELPA_KERNEL_GENERIC = ELPA2_REAL_KERNEL_GENERIC - integer, parameter :: REAL_ELPA_KERNEL_GENERIC_SIMPLE = ELPA2_REAL_KERNEL_GENERIC_SIMPLE - integer, parameter :: REAL_ELPA_KERNEL_BGP = ELPA2_REAL_KERNEL_BGP - integer, parameter :: REAL_ELPA_KERNEL_BGQ = ELPA2_REAL_KERNEL_BGQ - integer, parameter :: REAL_ELPA_KERNEL_SSE = ELPA2_REAL_KERNEL_SSE - integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_REAL_KERNEL_AVX_BLOCK2 - integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4 = ELPA2_REAL_KERNEL_AVX_BLOCK4 - integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6 = ELPA2_REAL_KERNEL_AVX_BLOCK6 - -#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC -#else - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC -#endif - character(35), parameter, dimension(number_of_real_kernels) :: & - REAL_ELPA_KERNEL_NAMES = 
(/"REAL_ELPA_KERNEL_GENERIC ", & - "REAL_ELPA_KERNEL_GENERIC_SIMPLE ", & - "REAL_ELPA_KERNEL_BGP ", & - "REAL_ELPA_KERNEL_BGQ ", & - "REAL_ELPA_KERNEL_SSE ", & - "REAL_ELPA_KERNEL_AVX_BLOCK2 ", & - "REAL_ELPA_KERNEL_AVX_BLOCK4 ", & - "REAL_ELPA_KERNEL_AVX_BLOCK6 "/) - - integer, parameter :: number_of_complex_kernels = ELPA2_NUMBER_OF_COMPLEX_KERNELS - integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC = ELPA2_COMPLEX_KERNEL_GENERIC - integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE = ELPA2_COMPLEX_KERNEL_GENERIC_SIMPLE - integer, parameter :: COMPLEX_ELPA_KERNEL_BGP = ELPA2_COMPLEX_KERNEL_BGP - integer, parameter :: COMPLEX_ELPA_KERNEL_BGQ = ELPA2_COMPLEX_KERNEL_BGQ - integer, parameter :: COMPLEX_ELPA_KERNEL_SSE = ELPA2_COMPLEX_KERNEL_SSE - integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 - integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 - -#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC -#else - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC -#endif - character(35), parameter, dimension(number_of_complex_kernels) :: & - COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC ", & - "COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE ", & - "COMPLEX_ELPA_KERNEL_BGP ", & - "COMPLEX_ELPA_KERNEL_BGQ ", & - "COMPLEX_ELPA_KERNEL_SSE ", & - "COMPLEX_ELPA_KERNEL_AVX_BLOCK1 ", & - "COMPLEX_ELPA_KERNEL_AVX_BLOCK2 "/) - - integer, parameter :: & - AVAILABLE_REAL_ELPA_KERNELS(number_of_real_kernels) = & - (/ & -#if WITH_REAL_GENERIC_KERNEL - 1 & -#else - 0 & -#endif -#if WITH_REAL_GENERIC_SIMPLE_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_REAL_BGP_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_REAL_BGQ_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_REAL_SSE_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_REAL_AVX_BLOCK2_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_REAL_AVX_BLOCK4_KERNEL - ,1 & 
-#else - ,0 & -#endif -#if WITH_REAL_AVX_BLOCK6_KERNEL - ,1 & -#else - ,0 & -#endif - /) - - integer, parameter :: & - AVAILABLE_COMPLEX_ELPA_KERNELS(number_of_complex_kernels) = & - (/ & -#if WITH_COMPLEX_GENERIC_KERNEL - 1 & -#else - 0 & -#endif -#if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_COMPLEX_BGP_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_COMPLEX_BGQ_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_COMPLEX_SSE_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_COMPLEX_AVX_BLOCK1_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_COMPLEX_AVX_BLOCK2_KERNEL - ,1 & -#else - ,0 & -#endif - /) - -!****** - contains - - subroutine print_available_real_kernels -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("print_available_real_kernels") -#endif - - do i=1, number_of_real_kernels - if (AVAILABLE_REAL_ELPA_KERNELS(i) .eq. 1) then - write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i) - endif - enddo - write(error_unit,*) " " - write(error_unit,*) " At the moment the following kernel would be choosen:" - write(error_unit,*) get_actual_real_kernel_name() - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("print_available_real_kernels") -#endif - - end subroutine print_available_real_kernels - - subroutine print_available_complex_kernels -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - - implicit none - - integer :: i -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("print_available_complex_kernels") -#endif - - do i=1, number_of_complex_kernels - if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .eq. 
1) then - write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i) - endif - enddo - write(error_unit,*) " " - write(error_unit,*) " At the moment the following kernel would be choosen:" - write(error_unit,*) get_actual_complex_kernel_name() - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("print_available_complex_kernels") -#endif - - end subroutine print_available_complex_kernels - - function get_actual_real_kernel() result(actual_kernel) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer :: actual_kernel - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("get_actual_real_kernel") -#endif - - - ! if kernel is not choosen via api - ! check whether set by environment variable - actual_kernel = real_kernel_via_environment_variable() - - if (actual_kernel .eq. 0) then - ! if not then set default kernel - actual_kernel = DEFAULT_REAL_ELPA_KERNEL - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("get_actual_real_kernel") -#endif - - end function get_actual_real_kernel - - function get_actual_real_kernel_name() result(actual_kernel_name) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - character(35) :: actual_kernel_name - integer :: actual_kernel - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("get_actual_real_kernel_name") -#endif - - actual_kernel = get_actual_real_kernel() - actual_kernel_name = REAL_ELPA_KERNEL_NAMES(actual_kernel) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("get_actual_real_kernel_name") -#endif - - end function get_actual_real_kernel_name - - function get_actual_complex_kernel() result(actual_kernel) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - integer :: actual_kernel - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("get_actual_complex_kernel") -#endif - - - ! if kernel is not choosen via api - ! check whether set by environment variable - actual_kernel = complex_kernel_via_environment_variable() - - if (actual_kernel .eq. 0) then - ! 
if not then set default kernel - actual_kernel = DEFAULT_COMPLEX_ELPA_KERNEL - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("get_actual_complex_kernel") -#endif - - end function get_actual_complex_kernel - - function get_actual_complex_kernel_name() result(actual_kernel_name) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - character(35) :: actual_kernel_name - integer :: actual_kernel - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("get_actual_complex_kernel_name") -#endif - - actual_kernel = get_actual_complex_kernel() - actual_kernel_name = COMPLEX_ELPA_KERNEL_NAMES(actual_kernel) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("get_actual_complex_kernel_name") -#endif - - end function get_actual_complex_kernel_name - - function check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL) result(err) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - integer, intent(in) :: THIS_REAL_ELPA_KERNEL - - logical :: err - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("check_allowed_real_kernels") -#endif - err = .false. - - if (AVAILABLE_REAL_ELPA_KERNELS(THIS_REAL_ELPA_KERNEL) .ne. 1) err=.true. - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("check_allowed_real_kernels") -#endif - - end function check_allowed_real_kernels - - function check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL) result(err) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - integer, intent(in) :: THIS_COMPLEX_ELPA_KERNEL - - logical :: err -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("check_allowed_complex_kernels") -#endif - err = .false. - - if (AVAILABLE_COMPLEX_ELPA_KERNELS(THIS_COMPLEX_ELPA_KERNEL) .ne. 1) err=.true. 
- -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("check_allowed_complex_kernels") -#endif - - end function check_allowed_complex_kernels - - function qr_decomposition_via_environment_variable(useQR) result(isSet) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - logical, intent(out) :: useQR - logical :: isSet - CHARACTER(len=255) :: ELPA_QR_DECOMPOSITION - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_decomposition_via_environment_variable") -#endif - - isSet = .false. - -#if defined(HAVE_ENVIRONMENT_CHECKING) - call get_environment_variable("ELPA_QR_DECOMPOSITION",ELPA_QR_DECOMPOSITION) -#endif - if (trim(ELPA_QR_DECOMPOSITION) .eq. "yes") then - useQR = .true. - isSet = .true. - endif - if (trim(ELPA_QR_DECOMPOSITION) .eq. "no") then - useQR = .false. - isSet = .true. - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_decomposition_via_environment_variable") -#endif - - end function qr_decomposition_via_environment_variable - - - function real_kernel_via_environment_variable() result(kernel) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - integer :: kernel - CHARACTER(len=255) :: REAL_KERNEL_ENVIRONMENT - integer :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("real_kernel_via_environment_variable") -#endif - -#if defined(HAVE_ENVIRONMENT_CHECKING) - call get_environment_variable("REAL_ELPA_KERNEL",REAL_KERNEL_ENVIRONMENT) -#endif - do i=1,size(REAL_ELPA_KERNEL_NAMES(:)) - ! if (trim(dummy_char) .eq. trim(REAL_ELPA_KERNEL_NAMES(i))) then - if (trim(REAL_KERNEL_ENVIRONMENT) .eq. 
trim(REAL_ELPA_KERNEL_NAMES(i))) then - kernel = i - exit - else - kernel = 0 - endif - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("real_kernel_via_environment_variable") -#endif - - end function real_kernel_via_environment_variable - - function complex_kernel_via_environment_variable() result(kernel) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - integer :: kernel - - CHARACTER(len=255) :: COMPLEX_KERNEL_ENVIRONMENT - integer :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("complex_kernel_via_environment_variable") -#endif - -#if defined(HAVE_ENVIRONMENT_CHECKING) - call get_environment_variable("COMPLEX_ELPA_KERNEL",COMPLEX_KERNEL_ENVIRONMENT) -#endif - - do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:)) - if (trim(COMPLEX_ELPA_KERNEL_NAMES(i)) .eq. trim(COMPLEX_KERNEL_ENVIRONMENT)) then - kernel = i - exit - else - kernel = 0 - endif - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("complex_kernel_via_environment_variable") -#endif - - end function -!------------------------------------------------------------------------------- - -end module ELPA2_utilities diff --git a/src/elpa_c_interface.F90 b/src/elpa_c_interface.F90 deleted file mode 100644 index c4d14e41b..000000000 --- a/src/elpa_c_interface.F90 +++ /dev/null @@ -1,318 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! 
More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - -#include "config-f90.h" - !c> #include <complex.h> - - !c> /*! 
\brief C old, deprecated interface to create the MPI communicators for ELPA - !c> * - !c> * \param mpi_comm_word MPI global communicator (in) - !c> * \param my_prow Row coordinate of the calling process in the process grid (in) - !c> * \param my_pcol Column coordinate of the calling process in the process grid (in) - !c> * \param mpi_comm_rows Communicator for communicating within rows of processes (out) - !c> * \result int integer error value of mpi_comm_split function - !c> */ - !c> int elpa_get_communicators(int mpi_comm_world, int my_prow, int my_pcol, int *mpi_comm_rows, int *mpi_comm_cols); - function get_elpa_row_col_comms_wrapper_c_name1(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) & - result(mpierr) bind(C,name="elpa_get_communicators") - use, intrinsic :: iso_c_binding - use elpa1, only : get_elpa_row_col_comms - - implicit none - integer(kind=c_int) :: mpierr - integer(kind=c_int), value :: mpi_comm_world, my_prow, my_pcol - integer(kind=c_int) :: mpi_comm_rows, mpi_comm_cols - - mpierr = get_elpa_row_col_comms(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - - end function - !c> #include <complex.h> - - !c> /*! 
\brief C interface to create the MPI communicators for ELPA - !c> * - !c> * \param mpi_comm_word MPI global communicator (in) - !c> * \param my_prow Row coordinate of the calling process in the process grid (in) - !c> * \param my_pcol Column coordinate of the calling process in the process grid (in) - !c> * \param mpi_comm_rows Communicator for communicating within rows of processes (out) - !c> * \result int integer error value of mpi_comm_split function - !c> */ - !c> int get_elpa_communicators(int mpi_comm_world, int my_prow, int my_pcol, int *mpi_comm_rows, int *mpi_comm_cols); - function get_elpa_row_col_comms_wrapper_c_name2(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) & - result(mpierr) bind(C,name="get_elpa_communicators") - use, intrinsic :: iso_c_binding - use elpa1, only : get_elpa_row_col_comms - - implicit none - integer(kind=c_int) :: mpierr - integer(kind=c_int), value :: mpi_comm_world, my_prow, my_pcol - integer(kind=c_int) :: mpi_comm_rows, mpi_comm_cols - - mpierr = get_elpa_row_col_comms(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - - end function - - - - !c> /*! \brief C interface to solve the real eigenvalue problem with 1-stage solver - !c> * - !c> * \param na Order of matrix a - !c> * \param nev Number of eigenvalues needed. - !c> * The smallest nev eigenvalues/eigenvectors are calculated. - !c> * \param a Distributed matrix for which eigenvalues are to be computed. - !c> * Distribution is like in Scalapack. - !c> * The full matrix must be set (not only one half like in scalapack). - !c> * \param lda Leading dimension of a - !c> * \param ev(na) On output: eigenvalues of a, every processor gets the complete set - !c> * \param q On output: Eigenvectors of a - !c> * Distribution is like in Scalapack. - !c> * Must be always dimensioned to the full size (corresponding to (na,na)) - !c> * even if only a part of the eigenvalues is needed. 
- !c> * \param ldq Leading dimension of q - !c> * \param nblk blocksize of cyclic distribution, must be the same in both directions! - !c> * \param matrixCols distributed number of matrix columns - !c> * \param mpi_comm_rows MPI-Communicator for rows - !c> * \param mpi_comm_cols MPI-Communicator for columns - !c> * - !c> * \result int: 1 if error occured, otherwise 0 - !c>*/ - !c> int elpa_solve_evp_real_1stage(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols); - function solve_elpa1_evp_real_wrapper(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_cols) & - result(success) bind(C,name="elpa_solve_evp_real_1stage") - - use, intrinsic :: iso_c_binding - use elpa1, only : solve_evp_real - - implicit none - integer(kind=c_int) :: success - integer(kind=c_int), value, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_cols, mpi_comm_rows - real(kind=c_double) :: a(1:lda,1:matrixCols), ev(1:na), q(1:ldq,1:matrixCols) - - logical :: successFortran - - successFortran = solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) - - if (successFortran) then - success = 1 - else - success = 0 - endif - - end function - - - !c> /*! \brief C interface to solve the complex eigenvalue problem with 1-stage solver - !c> * - !c> * \param na Order of matrix a - !c> * \param nev Number of eigenvalues needed. - !c> * The smallest nev eigenvalues/eigenvectors are calculated. - !c> * \param a Distributed matrix for which eigenvalues are to be computed. - !c> * Distribution is like in Scalapack. - !c> * The full matrix must be set (not only one half like in scalapack). - !c> * \param lda Leading dimension of a - !c> * \param ev(na) On output: eigenvalues of a, every processor gets the complete set - !c> * \param q On output: Eigenvectors of a - !c> * Distribution is like in Scalapack. 
- !c> * Must be always dimensioned to the full size (corresponding to (na,na)) - !c> * even if only a part of the eigenvalues is needed. - !c> * \param ldq Leading dimension of q - !c> * \param nblk blocksize of cyclic distribution, must be the same in both directions! - !c> * \param matrixCols distributed number of matrix columns - !c> * \param mpi_comm_rows MPI-Communicator for rows - !c> * \param mpi_comm_cols MPI-Communicator for columns - !c> * - !c> * \result int: 1 if error occured, otherwise 0 - !c> */ - !c> int elpa_solve_evp_complex_1stage(int na, int nev, double complex *a, int lda, double *ev, double complex *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols); - function solve_evp_real_wrapper(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_cols) & - result(success) bind(C,name="elpa_solve_evp_complex_1stage") - - use, intrinsic :: iso_c_binding - use elpa1, only : solve_evp_complex - - implicit none - integer(kind=c_int) :: success - integer(kind=c_int), value, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_cols, mpi_comm_rows - complex(kind=c_double_complex) :: a(1:lda,1:matrixCols), q(1:ldq,1:matrixCols) - real(kind=c_double) :: ev(1:na) - - logical :: successFortran - - successFortran = solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) - - if (successFortran) then - success = 1 - else - success = 0 - endif - - end function - !c> /*! \brief C interface to solve the real eigenvalue problem with 2-stage solver - !c> * - !c> * \param na Order of matrix a - !c> * \param nev Number of eigenvalues needed. - !c> * The smallest nev eigenvalues/eigenvectors are calculated. - !c> * \param a Distributed matrix for which eigenvalues are to be computed. - !c> * Distribution is like in Scalapack. - !c> * The full matrix must be set (not only one half like in scalapack). 
- !c> * \param lda Leading dimension of a - !c> * \param ev(na) On output: eigenvalues of a, every processor gets the complete set - !c> * \param q On output: Eigenvectors of a - !c> * Distribution is like in Scalapack. - !c> * Must be always dimensioned to the full size (corresponding to (na,na)) - !c> * even if only a part of the eigenvalues is needed. - !c> * \param ldq Leading dimension of q - !c> * \param nblk blocksize of cyclic distribution, must be the same in both directions! - !c> * \param matrixCols distributed number of matrix columns - !c> * \param mpi_comm_rows MPI-Communicator for rows - !c> * \param mpi_comm_cols MPI-Communicator for columns - !c> * \param mpi_coll_all MPI communicator for the total processor set - !c> * \param THIS_REAL_ELPA_KERNEL_API specify used ELPA2 kernel via API - !c> * \param use_qr use QR decomposition 1 = yes, 0 = no - !c> * - !c> * \result int: 1 if error occured, otherwise 0 - !c> */ - !c> int elpa_solve_evp_real_2stage(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_REAL_ELPA_KERNEL_API, int useQR); - function solve_elpa2_evp_real_wrapper(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, & - THIS_REAL_ELPA_KERNEL_API, useQR) & - result(success) bind(C,name="elpa_solve_evp_real_2stage") - - use, intrinsic :: iso_c_binding - use elpa2, only : solve_evp_real_2stage - - implicit none - integer(kind=c_int) :: success - integer(kind=c_int), value, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_cols, mpi_comm_rows, & - mpi_comm_all - integer(kind=c_int), value, intent(in) :: THIS_REAL_ELPA_KERNEL_API, useQR - real(kind=c_double) :: a(1:lda,1:matrixCols), ev(1:na), q(1:ldq,1:matrixCols) - - - - logical :: successFortran, useQRFortran - - if (useQR .eq. 0) then - useQRFortran =.false. - else - useQRFortran = .true. 
- endif - - successFortran = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, & - mpi_comm_cols, mpi_comm_all, & - THIS_REAL_ELPA_KERNEL_API, useQRFortran) - - if (successFortran) then - success = 1 - else - success = 0 - endif - - end function - - - !c> /*! \brief C interface to solve the complex eigenvalue problem with 2-stage solver - !c> * - !c> * \param na Order of matrix a - !c> * \param nev Number of eigenvalues needed. - !c> * The smallest nev eigenvalues/eigenvectors are calculated. - !c> * \param a Distributed matrix for which eigenvalues are to be computed. - !c> * Distribution is like in Scalapack. - !c> * The full matrix must be set (not only one half like in scalapack). - !c> * \param lda Leading dimension of a - !c> * \param ev(na) On output: eigenvalues of a, every processor gets the complete set - !c> * \param q On output: Eigenvectors of a - !c> * Distribution is like in Scalapack. - !c> * Must be always dimensioned to the full size (corresponding to (na,na)) - !c> * even if only a part of the eigenvalues is needed. - !c> * \param ldq Leading dimension of q - !c> * \param nblk blocksize of cyclic distribution, must be the same in both directions! 
- !c> * \param matrixCols distributed number of matrix columns - !c> * \param mpi_comm_rows MPI-Communicator for rows - !c> * \param mpi_comm_cols MPI-Communicator for columns - !c> * \param mpi_coll_all MPI communicator for the total processor set - !c> * \param THIS_REAL_ELPA_KERNEL_API specify used ELPA2 kernel via API - !c> * \param use_qr use QR decomposition 1 = yes, 0 = no - !c> * - !c> * \result int: 1 if error occured, otherwise 0 - !c> */ - !c> int elpa_solve_evp_complex_2stage(int na, int nev, double complex *a, int lda, double *ev, double complex *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_COMPLEX_ELPA_KERNEL_API); - function solve_elpa2_evp_complex_wrapper(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, & - THIS_COMPLEX_ELPA_KERNEL_API) & - result(success) bind(C,name="elpa_solve_evp_complex_2stage") - - use, intrinsic :: iso_c_binding - use elpa2, only : solve_evp_complex_2stage - - implicit none - integer(kind=c_int) :: success - integer(kind=c_int), value, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_cols, mpi_comm_rows, & - mpi_comm_all - integer(kind=c_int), value, intent(in) :: THIS_COMPLEX_ELPA_KERNEL_API - complex(kind=c_double_complex) :: a(1:lda,1:matrixCols), q(1:ldq,1:matrixCols) - real(kind=c_double) :: ev(1:na) - logical :: successFortran - - successFortran = solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, & - mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) - - if (successFortran) then - success = 1 - else - success = 0 - endif - - end function - diff --git a/src/elpa_qr/elpa_pdgeqrf.F90 b/src/elpa_qr/elpa_pdgeqrf.F90 deleted file mode 100644 index 2cc5b83bc..000000000 --- a/src/elpa_qr/elpa_pdgeqrf.F90 +++ /dev/null @@ -1,2411 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! 
consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! 
- -#include "config-f90.h" - -module elpa_pdgeqrf - - use elpa1_compute - use elpa_pdlarfb - use qr_utils_mod - - implicit none - - PRIVATE - - public :: qr_pdgeqrf_2dcomm - public :: qr_pqrparam_init - public :: qr_pdlarfg2_1dcomm_check - - include 'mpif.h' - - contains - - subroutine qr_pdgeqrf_2dcomm(a, lda, matrixCols, v, ldv, vmrCols, tau, lengthTau, t, ldt, colsT, & - work, workLength, lwork, m, n, mb, nb, rowidx, colidx, & - rev, trans, PQRPARAM, mpicomm_rows, mpicomm_cols, blockheuristic) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - INTEGER(kind=ik), parameter :: gmode_ = 1, rank_ = 2, eps_ = 3 - - ! input variables (local) - integer(kind=ik), intent(in) :: lda, lwork, ldv, ldt, matrixCols, m, vmrCols, lengthTau, & - colsT, workLength - - ! input variables (global) - integer(kind=ik) :: n, mb, nb, rowidx, colidx, rev, trans, mpicomm_cols, mpicomm_rows -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - integer(kind=ik) :: PQRPARAM(*) - real(kind=rk) :: a(lda,*), v(ldv,*), tau(*), t(ldt,*), work(*) -#else - integer(kind=ik) :: PQRPARAM(1:11) - real(kind=rk) :: a(1:lda,1:matrixCols), v(1:ldv,1:vmrCols), tau(1:lengthTau), & - t(1:ldt,1:colsT), work(1:workLength) -#endif - ! output variables (global) - real(kind=rk) :: blockheuristic(*) - - ! input variables derived from PQRPARAM - integer(kind=ik) :: updatemode,tmerge,size2d - - ! 
local scalars - integer(kind=ik) :: mpierr,mpirank_cols,broadcast_size,mpirank_rows - integer(kind=ik) :: mpirank_cols_qr,mpiprocs_cols - integer(kind=ik) :: lcols_temp,lcols,icol,lastcol - integer(kind=ik) :: baseoffset,offset,idx,voffset - integer(kind=ik) :: update_voffset,update_tauoffset - integer(kind=ik) :: update_lcols - integer(kind=ik) :: work_offset - - real(kind=rk) :: dbroadcast_size(1),dtmat_bcast_size(1) - real(kind=rk) :: pdgeqrf_size(1),pdlarft_size(1),pdlarfb_size(1),tmerge_pdlarfb_size(1) - integer(kind=ik) :: temptau_offset,temptau_size,broadcast_offset,tmat_bcast_size - integer(kind=ik) :: remaining_cols - integer(kind=ik) :: total_cols - integer(kind=ik) :: incremental_update_size ! needed for incremental update mode - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdgeqrf_2dcomm") -#endif - size2d = PQRPARAM(1) - updatemode = PQRPARAM(2) - tmerge = PQRPARAM(3) - - ! copy value before we are going to filter it - total_cols = n - - call mpi_comm_rank(mpicomm_cols,mpirank_cols,mpierr) - call mpi_comm_rank(mpicomm_rows,mpirank_rows,mpierr) - call mpi_comm_size(mpicomm_cols,mpiprocs_cols,mpierr) - -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdgeqrf_1dcomm(a,lda,v,ldv,tau,t,ldt,pdgeqrf_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,trans, & - PQRPARAM(4),mpicomm_rows,blockheuristic) -#else - call qr_pdgeqrf_1dcomm(a,lda,v,ldv,tau,t,ldt,pdgeqrf_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,trans, & - PQRPARAM(4:11),mpicomm_rows,blockheuristic) -#endif - call qr_pdgeqrf_pack_unpack(v,ldv,dbroadcast_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,0,mpicomm_rows) - call qr_pdgeqrf_pack_unpack_tmatrix(tau,t,ldt,dtmat_bcast_size(1),-1,total_cols,0) - - pdlarft_size(1) = 0.0d0 - - call qr_pdlarfb_1dcomm(m,mb,total_cols,total_cols,a,lda,v,ldv,tau,t,ldt,rowidx,rowidx,rev,mpicomm_rows, & - pdlarfb_size(1),-1) - call qr_tmerge_pdlarfb_1dcomm(m,mb,total_cols,total_cols,total_cols,v,ldv,t,ldt,a,lda,rowidx,rev,updatemode, & - 
mpicomm_rows,tmerge_pdlarfb_size(1),-1) - - - temptau_offset = 1 - temptau_size = total_cols - broadcast_offset = temptau_offset + temptau_size - broadcast_size = dbroadcast_size(1) + dtmat_bcast_size(1) - work_offset = broadcast_offset + broadcast_size - - if (lwork .eq. -1) then - work(1) = (DBLE(temptau_size) + DBLE(broadcast_size) + max(pdgeqrf_size(1),pdlarft_size(1),pdlarfb_size(1), & - tmerge_pdlarfb_size(1))) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqrf_2dcomm") -#endif - return - end if - - lastcol = colidx-total_cols+1 - voffset = total_cols - - incremental_update_size = 0 - - ! clear v buffer: just ensure that there is no junk in the upper triangle - ! part, otherwise pdlarfb gets some problems - ! pdlarfl(2) do not have these problems as they are working more on a vector - ! basis - v(1:ldv,1:total_cols) = 0.0d0 - - icol = colidx - - remaining_cols = total_cols - - !print *,'start decomposition',m,rowidx,colidx - - do while (remaining_cols .gt. 0) - - ! determine rank of process column with next qr block - mpirank_cols_qr = MOD((icol-1)/nb,mpiprocs_cols) - - ! lcols can't be larger than than nb - ! exception: there is only one process column - - ! however, we might not start at the first local column. - ! therefore assume a matrix of size (1xlcols) starting at (1,icol) - ! determine the real amount of local columns - lcols_temp = min(nb,(icol-lastcol+1)) - - ! blocking parameter - lcols_temp = max(min(lcols_temp,size2d),1) - - ! determine size from last decomposition column - ! to first decomposition column - call local_size_offset_1d(icol,nb,icol-lcols_temp+1,icol-lcols_temp+1,0, & - mpirank_cols_qr,mpiprocs_cols, & - lcols,baseoffset,offset) - - voffset = remaining_cols - lcols + 1 - - idx = rowidx - colidx + icol - - if (mpirank_cols .eq. mpirank_cols_qr) then - ! 
qr decomposition part - - tau(offset:offset+lcols-1) = 0.0d0 - -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdgeqrf_1dcomm(a(1,offset),lda,v(1,voffset),ldv,tau(offset),t(voffset,voffset),ldt, & - work(work_offset),lwork,m,lcols,mb,rowidx,idx,rev,trans,PQRPARAM(4), & - mpicomm_rows,blockheuristic) - -#else - call qr_pdgeqrf_1dcomm(a(1,offset),lda,v(1,voffset),ldv,tau(offset),t(voffset,voffset),ldt, & - work(work_offset),lwork,m,lcols,mb,rowidx,idx,rev,trans,PQRPARAM(4:11), & - mpicomm_rows,blockheuristic) -#endif - - ! pack broadcast buffer (v + tau) - call qr_pdgeqrf_pack_unpack(v(1,voffset),ldv,work(broadcast_offset),lwork,m,lcols,mb,rowidx,& - idx,rev,0,mpicomm_rows) - - ! determine broadcast size - call qr_pdgeqrf_pack_unpack(v(1,voffset),ldv,dbroadcast_size(1),-1,m,lcols,mb,rowidx,idx,rev,& - 0,mpicomm_rows) - broadcast_size = dbroadcast_size(1) - - !if (mpirank_rows .eq. 0) then - ! pack tmatrix into broadcast buffer and calculate new size - call qr_pdgeqrf_pack_unpack_tmatrix(tau(offset),t(voffset,voffset),ldt, & - work(broadcast_offset+broadcast_size),lwork,lcols,0) - call qr_pdgeqrf_pack_unpack_tmatrix(tau(offset),t(voffset,voffset),ldt,dtmat_bcast_size(1),-1,lcols,0) - broadcast_size = broadcast_size + dtmat_bcast_size(1) - !end if - - ! initiate broadcast (send part) - call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, & - mpirank_cols_qr,mpicomm_cols,mpierr) - - ! copy tau parts into temporary tau buffer - work(temptau_offset+voffset-1:temptau_offset+(voffset-1)+lcols-1) = tau(offset:offset+lcols-1) - - !print *,'generated tau:', tau(offset) - else - ! vector exchange part - - ! 
determine broadcast size - call qr_pdgeqrf_pack_unpack(v(1,voffset),ldv,dbroadcast_size(1),-1,m,lcols,mb,rowidx,idx,rev,1,mpicomm_rows) - broadcast_size = dbroadcast_size(1) - - call qr_pdgeqrf_pack_unpack_tmatrix(work(temptau_offset+voffset-1),t(voffset,voffset),ldt, & - dtmat_bcast_size(1),-1,lcols,0) - tmat_bcast_size = dtmat_bcast_size(1) - - !print *,'broadcast_size (nonqr)',broadcast_size - broadcast_size = dbroadcast_size(1) + dtmat_bcast_size(1) - - ! initiate broadcast (recv part) - call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, & - mpirank_cols_qr,mpicomm_cols,mpierr) - - ! last n*n elements in buffer are (still empty) T matrix elements - ! fetch from first process in each column - - ! unpack broadcast buffer (v + tau) - call qr_pdgeqrf_pack_unpack(v(1,voffset),ldv,work(broadcast_offset),lwork,m,lcols,mb,rowidx,idx,rev,1,mpicomm_rows) - - ! now send t matrix to other processes in our process column - broadcast_size = dbroadcast_size(1) - tmat_bcast_size = dtmat_bcast_size(1) - - ! t matrix should now be available on all processes => unpack - call qr_pdgeqrf_pack_unpack_tmatrix(work(temptau_offset+voffset-1),t(voffset,voffset),ldt, & - work(broadcast_offset+broadcast_size),lwork,lcols,1) - end if - - remaining_cols = remaining_cols - lcols - - ! apply householder vectors to whole trailing matrix parts (if any) - - update_voffset = voffset - update_tauoffset = icol - update_lcols = lcols - incremental_update_size = incremental_update_size + lcols - - icol = icol - lcols - ! count colums from first column of global block to current index - call local_size_offset_1d(icol,nb,colidx-n+1,colidx-n+1,0, & - mpirank_cols,mpiprocs_cols, & - lcols,baseoffset,offset) - - if (lcols .gt. 0) then - - !print *,'updating trailing matrix' - - if (updatemode .eq. ichar('I')) then - print *,'pdgeqrf_2dcomm: incremental update not yet implemented! rev=1' - else if (updatemode .eq. ichar('F')) then - ! 
full update no merging - call qr_pdlarfb_1dcomm(m,mb,lcols,update_lcols,a(1,offset),lda,v(1,update_voffset),ldv, & - work(temptau_offset+update_voffset-1), & - t(update_voffset,update_voffset),ldt, & - rowidx,idx,1,mpicomm_rows,work(work_offset),lwork) - else - ! full update + merging default - call qr_tmerge_pdlarfb_1dcomm(m,mb,lcols,n-(update_voffset+update_lcols-1),update_lcols, & - v(1,update_voffset),ldv, & - t(update_voffset,update_voffset),ldt, & - a(1,offset),lda,rowidx,1,updatemode,mpicomm_rows, & - work(work_offset),lwork) - end if - else - if (updatemode .eq. ichar('I')) then - print *,'sole merging of (incremental) T matrix', mpirank_cols, & - n-(update_voffset+incremental_update_size-1) - call qr_tmerge_pdlarfb_1dcomm(m,mb,0,n-(update_voffset+incremental_update_size-1), & - incremental_update_size,v(1,update_voffset),ldv, & - t(update_voffset,update_voffset),ldt, & - a,lda,rowidx,1,updatemode,mpicomm_rows,work(work_offset),lwork) - - ! reset for upcoming incremental updates - incremental_update_size = 0 - else if (updatemode .eq. ichar('M')) then - ! final merge - call qr_tmerge_pdlarfb_1dcomm(m,mb,0,n-(update_voffset+update_lcols-1),update_lcols, & - v(1,update_voffset),ldv, & - t(update_voffset,update_voffset),ldt, & - a,lda,rowidx,1,updatemode,mpicomm_rows,work(work_offset),lwork) - else - ! full updatemode - nothing to update - end if - - ! reset for upcoming incremental updates - incremental_update_size = 0 - end if - end do - - if ((tmerge .gt. 0) .and. (updatemode .eq. ichar('F'))) then - ! 
finally merge all small T parts - call qr_pdlarft_tree_merge_1dcomm(m,mb,n,size2d,tmerge,v,ldv,t,ldt,rowidx,rev,mpicomm_rows,work,lwork) - end if - - !print *,'stop decomposition',rowidx,colidx -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdgeqrf_2dcomm") -#endif - end subroutine qr_pdgeqrf_2dcomm - - subroutine qr_pdgeqrf_1dcomm(a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,baseidx,rowidx,rev,trans,PQRPARAM,mpicomm,blockheuristic) - use precision - use ELPA1 -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2,eps_ = 3 - - ! input variables (local) - integer(kind=ik) :: lda,lwork,ldv,ldt - real(kind=rk) :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(*) - - ! input variables (global) - integer(kind=ik) :: m,n,mb,baseidx,rowidx,rev,trans,mpicomm -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - integer(kind=ik) :: PQRPARAM(*) - -#else - integer(kind=ik) :: PQRPARAM(:) -#endif - ! derived input variables - - ! derived further input variables from QR_PQRPARAM - integer(kind=ik) :: size1d,updatemode,tmerge - - ! output variables (global) - real(kind=rk) :: blockheuristic(*) - - ! local scalars - integer(kind=ik) :: nr_blocks,remainder,current_block,aoffset,idx,updatesize - real(kind=rk) :: pdgeqr2_size(1),pdlarfb_size(1),tmerge_tree_size(1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdgeqrf_1dcomm") -#endif - size1d = max(min(PQRPARAM(1),n),1) - updatemode = PQRPARAM(2) - tmerge = PQRPARAM(3) - - if (lwork .eq. -1) then -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdgeqr2_1dcomm(a,lda,v,ldv,tau,t,ldt,pdgeqr2_size,-1, & - m,size1d,mb,baseidx,baseidx,rev,trans,PQRPARAM(4),mpicomm,blockheuristic) -#else - call qr_pdgeqr2_1dcomm(a,lda,v,ldv,tau,t,ldt,pdgeqr2_size,-1, & - m,size1d,mb,baseidx,baseidx,rev,trans,PQRPARAM(4:),mpicomm,blockheuristic) -#endif - ! 
reserve more space for incremental mode - call qr_tmerge_pdlarfb_1dcomm(m,mb,n,n,n,v,ldv,t,ldt, & - a,lda,baseidx,rev,updatemode,mpicomm,pdlarfb_size,-1) - - call qr_pdlarft_tree_merge_1dcomm(m,mb,n,size1d,tmerge,v,ldv,t,ldt,baseidx,rev,mpicomm,tmerge_tree_size,-1) - - work(1) = max(pdlarfb_size(1),pdgeqr2_size(1),tmerge_tree_size(1)) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqrf_1dcomm") -#endif - return - end if - - nr_blocks = n / size1d - remainder = n - nr_blocks*size1d - - current_block = 0 - do while (current_block .lt. nr_blocks) - idx = rowidx-current_block*size1d - updatesize = n-(current_block+1)*size1d - aoffset = 1+updatesize -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdgeqr2_1dcomm(a(1,aoffset),lda,v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt,work,lwork, & - m,size1d,mb,baseidx,idx,1,trans,PQRPARAM(4),mpicomm,blockheuristic) - -#else - call qr_pdgeqr2_1dcomm(a(1,aoffset),lda,v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt,work,lwork, & - m,size1d,mb,baseidx,idx,1,trans,PQRPARAM(4:),mpicomm,blockheuristic) -#endif - if (updatemode .eq. ichar('M')) then - ! full update + merging - call qr_tmerge_pdlarfb_1dcomm(m,mb,updatesize,current_block*size1d,size1d, & - v(1,aoffset),ldv,t(aoffset,aoffset),ldt, & - a,lda,baseidx,1,ichar('F'),mpicomm,work,lwork) - else if (updatemode .eq. ichar('I')) then - if (updatesize .ge. size1d) then - ! incremental update + merging - call qr_tmerge_pdlarfb_1dcomm(m,mb,size1d,current_block*size1d,size1d, & - v(1,aoffset),ldv,t(aoffset,aoffset),ldt, & - a(1,aoffset-size1d),lda,baseidx,1,updatemode,mpicomm,work,lwork) - - else ! only remainder left - ! incremental update + merging - call qr_tmerge_pdlarfb_1dcomm(m,mb,remainder,current_block*size1d,size1d, & - v(1,aoffset),ldv,t(aoffset,aoffset),ldt, & - a(1,1),lda,baseidx,1,updatemode,mpicomm,work,lwork) - end if - else ! full update no merging is default - ! 
full update no merging - call qr_pdlarfb_1dcomm(m,mb,updatesize,size1d,a,lda,v(1,aoffset),ldv, & - tau(aoffset),t(aoffset,aoffset),ldt,baseidx,idx,1,mpicomm,work,lwork) - end if - - ! move on to next block - current_block = current_block+1 - end do - - if (remainder .gt. 0) then - aoffset = 1 - idx = rowidx-size1d*nr_blocks -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdgeqr2_1dcomm(a(1,aoffset),lda,v,ldv,tau,t,ldt,work,lwork, & - m,remainder,mb,baseidx,idx,1,trans,PQRPARAM(4),mpicomm,blockheuristic) - -#else - call qr_pdgeqr2_1dcomm(a(1,aoffset),lda,v,ldv,tau,t,ldt,work,lwork, & - m,remainder,mb,baseidx,idx,1,trans,PQRPARAM(4:),mpicomm,blockheuristic) -#endif - if ((updatemode .eq. ichar('I')) .or. (updatemode .eq. ichar('M'))) then - ! final merging - call qr_tmerge_pdlarfb_1dcomm(m,mb,0,size1d*nr_blocks,remainder, & - v,ldv,t,ldt, & - a,lda,baseidx,1,updatemode,mpicomm,work,lwork) ! updatemode argument does not matter - end if - end if - - if ((tmerge .gt. 0) .and. (updatemode .eq. ichar('F'))) then - ! finally merge all small T parts - call qr_pdlarft_tree_merge_1dcomm(m,mb,n,size1d,tmerge,v,ldv,t,ldt,baseidx,rev,mpicomm,work,lwork) - end if -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqrf_1dcomm") -#endif - - end subroutine qr_pdgeqrf_1dcomm - - ! local a and tau are assumed to be positioned at the right column from a local - ! perspective - ! TODO: if local amount of data turns to zero the algorithm might produce wrong - ! results (probably due to old buffer contents) - subroutine qr_pdgeqr2_1dcomm(a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,baseidx,rowidx,rev,trans,PQRPARAM,mpicomm,blockheuristic) - use precision - use ELPA1 -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2 ,eps_ = 3, upmode1_ = 4 - - ! input variables (local) - integer(kind=ik) :: lda,lwork,ldv,ldt - real(kind=rk) :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(*) - - ! 
input variables (global) - integer(kind=ik) :: m,n,mb,baseidx,rowidx,rev,trans,mpicomm -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - integer(kind=ik) :: PQRPARAM(*) -#else - integer(kind=ik) :: PQRPARAM(:) -#endif - ! output variables (global) - real(kind=rk) :: blockheuristic(*) - - ! derived further input variables from QR_PQRPARAM - integer(kind=ik) :: maxrank,hgmode,updatemode - - ! local scalars - integer(kind=ik) :: icol,incx,idx - real(kind=rk) :: pdlarfg_size(1),pdlarf_size(1),total_size - real(kind=rk) :: pdlarfg2_size(1),pdlarfgk_size(1),pdlarfl2_size(1) - real(kind=rk) :: pdlarft_size(1),pdlarfb_size(1),pdlarft_pdlarfb_size(1),tmerge_pdlarfb_size(1) - integer(kind=ik) :: mpirank,mpiprocs,mpierr - integer(kind=ik) :: rank,lastcol,actualrank,nextrank - integer(kind=ik) :: update_cols,decomposition_cols - integer(kind=ik) :: current_column -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdgeqr2_1dcomm") -#endif - - maxrank = min(PQRPARAM(1),n) - updatemode = PQRPARAM(2) - hgmode = PQRPARAM(4) - - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - - if (trans .eq. 1) then - incx = lda - else - incx = 1 - end if - - if (lwork .eq. 
-1) then - call qr_pdlarfg_1dcomm(a,incx,tau(1),pdlarfg_size(1),-1,n,rowidx,mb,hgmode,rev,mpicomm) - call qr_pdlarfl_1dcomm(v,1,baseidx,a,lda,tau(1),pdlarf_size(1),-1,m,n,rowidx,mb,rev,mpicomm) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdlarfg2_1dcomm_ref(a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfg2_size(1),-1,m,rowidx,mb,PQRPARAM, & - rev,mpicomm,actualrank) - - call qr_pdlarfgk_1dcomm(a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfgk_size(1),-1,m,n,rowidx,mb,PQRPARAM,rev,mpicomm,actualrank) - -#else - call qr_pdlarfg2_1dcomm_ref(a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfg2_size(1),-1,m,rowidx,mb,PQRPARAM(:), & - rev,mpicomm,actualrank) - - call qr_pdlarfgk_1dcomm(a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfgk_size(1),-1,m,n,rowidx,mb,PQRPARAM(:),rev,mpicomm,actualrank) -#endif - call qr_pdlarfl2_tmatrix_1dcomm(v,ldv,baseidx,a,lda,t,ldt,pdlarfl2_size(1),-1,m,n,rowidx,mb,rev,mpicomm) - pdlarft_size(1) = 0.0d0 - call qr_pdlarfb_1dcomm(m,mb,n,n,a,lda,v,ldv,tau,t,ldt,baseidx,rowidx,1,mpicomm,pdlarfb_size(1),-1) - pdlarft_pdlarfb_size(1) = 0.0d0 - call qr_tmerge_pdlarfb_1dcomm(m,mb,n,n,n,v,ldv,t,ldt,a,lda,rowidx,rev,updatemode,mpicomm,tmerge_pdlarfb_size(1),-1) - - total_size = max(pdlarfg_size(1),pdlarf_size(1),pdlarfg2_size(1),pdlarfgk_size(1),pdlarfl2_size(1),pdlarft_size(1), & - pdlarfb_size(1),pdlarft_pdlarfb_size(1),tmerge_pdlarfb_size(1)) - - work(1) = total_size -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqr2_1dcomm") -#endif - return - end if - - icol = 1 - lastcol = min(rowidx,n) - decomposition_cols = lastcol - update_cols = n - do while (decomposition_cols .gt. 0) ! local qr block - icol = lastcol-decomposition_cols+1 - idx = rowidx-icol+1 - - ! get possible rank size - ! limited by number of columns and remaining rows - rank = min(n-icol+1,maxrank,idx) - - current_column = n-icol+1-rank+1 - - if (rank .eq. 
1) then - - call qr_pdlarfg_1dcomm(a(1,current_column),incx, & - tau(current_column),work,lwork, & - m,idx,mb,hgmode,1,mpicomm) - - v(1:ldv,current_column) = 0.0d0 - call qr_pdlarfg_copy_1dcomm(a(1,current_column),incx, & - v(1,current_column),1, & - m,baseidx,idx,mb,1,mpicomm) - - ! initialize t matrix part - t(current_column,current_column) = tau(current_column) - - actualrank = 1 - - else if (rank .eq. 2) then -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdlarfg2_1dcomm_ref(a(1,current_column),lda,tau(current_column), & - t(current_column,current_column),ldt,v(1,current_column),ldv, & - baseidx,work,lwork,m,idx,mb,PQRPARAM,1,mpicomm,actualrank) - -#else - call qr_pdlarfg2_1dcomm_ref(a(1,current_column),lda,tau(current_column), & - t(current_column,current_column),ldt,v(1,current_column),ldv, & - baseidx,work,lwork,m,idx,mb,PQRPARAM(:),1,mpicomm,actualrank) -#endif - else -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdlarfgk_1dcomm(a(1,current_column),lda,tau(current_column), & - t(current_column,current_column),ldt,v(1,current_column),ldv, & - baseidx,work,lwork,m,rank,idx,mb,PQRPARAM,1,mpicomm,actualrank) - -#else - call qr_pdlarfgk_1dcomm(a(1,current_column),lda,tau(current_column), & - t(current_column,current_column),ldt,v(1,current_column),ldv, & - baseidx,work,lwork,m,rank,idx,mb,PQRPARAM(:),1,mpicomm,actualrank) -#endif - end if - - blockheuristic(actualrank) = blockheuristic(actualrank) + 1 - - ! the blocked decomposition versions already updated their non - ! decomposed parts using their information after communication - update_cols = decomposition_cols - rank - decomposition_cols = decomposition_cols - actualrank - - ! needed for incremental update - nextrank = min(n-(lastcol-decomposition_cols+1)+1,maxrank,rowidx-(lastcol-decomposition_cols+1)+1) - - if (current_column .gt. 1) then - idx = rowidx-icol+1 - - if (updatemode .eq. ichar('I')) then - ! 
incremental update + merging - call qr_tmerge_pdlarfb_1dcomm(m,mb,nextrank-(rank-actualrank),n-(current_column+rank-1),actualrank, & - v(1,current_column+(rank-actualrank)),ldv, & - t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, & - a(1,current_column-nextrank+(rank-actualrank)),lda,baseidx,rev,updatemode,& - mpicomm,work,lwork) - else - ! full update + merging - call qr_tmerge_pdlarfb_1dcomm(m,mb,update_cols,n-(current_column+rank-1),actualrank, & - v(1,current_column+(rank-actualrank)),ldv, & - t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, & - a(1,1),lda,baseidx,rev,updatemode,mpicomm,work,lwork) - end if - else - call qr_tmerge_pdlarfb_1dcomm(m,mb,0,n-(current_column+rank-1),actualrank,v(1,current_column+(rank-actualrank)), & - ldv, & - t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, & - a,lda,baseidx,rev,updatemode,mpicomm,work,lwork) - end if - - end do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqr2_1dcomm") -#endif - end subroutine qr_pdgeqr2_1dcomm - - ! incx == 1: column major - ! incx != 1: row major - subroutine qr_pdlarfg_1dcomm(x,incx,tau,work,lwork,n,idx,nb,hgmode,rev,mpi_comm) - - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2, eps_ = 3 - - ! input variables (local) - integer(kind=ik) :: incx,lwork,hgmode - real(kind=rk) :: x(*),work(*) - - ! input variables (global) - integer(kind=ik) :: mpi_comm,nb,idx,n,rev - - ! output variables (global) - real(kind=rk) :: tau - - ! local scalars - integer(kind=ik) :: mpierr,mpirank,mpiprocs,mpirank_top - integer(kind=ik) :: sendsize,recvsize - integer(kind=ik) :: local_size,local_offset,baseoffset - integer(kind=ik) :: topidx,top,iproc - real(kind=rk) :: alpha,xnorm,dot,xf - - ! external functions - real(kind=rk), external :: ddot,dlapy2,dnrm2 - external :: dscal - - ! intrinsic -! 
intrinsic sign -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfg_1dcomm") -#endif - if (idx .le. 1) then - tau = 0.0d0 -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg_1dcomm") -#endif - return - end if - - call MPI_Comm_rank(mpi_comm, mpirank, mpierr) - call MPI_Comm_size(mpi_comm, mpiprocs, mpierr) - - ! calculate expected work size and store in work(1) - if (hgmode .eq. ichar('s')) then - ! allreduce (MPI_SUM) - sendsize = 2 - recvsize = sendsize - else if (hgmode .eq. ichar('x')) then - ! alltoall - sendsize = mpiprocs*2 - recvsize = sendsize - else if (hgmode .eq. ichar('g')) then - ! allgather - sendsize = 2 - recvsize = mpiprocs*sendsize - else - ! no exchange at all (benchmarking) - sendsize = 2 - recvsize = sendsize - end if - - if (lwork .eq. -1) then - work(1) = DBLE(sendsize + recvsize) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg_1dcomm") -#endif - return - end if - - ! Processor id for global index of top element - mpirank_top = MOD((idx-1)/nb,mpiprocs) - if (mpirank .eq. mpirank_top) then - topidx = local_index(idx,mpirank_top,mpiprocs,nb,0) - top = 1+(topidx-1)*incx - end if - - call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, & - local_size,baseoffset,local_offset) - - local_offset = local_offset * incx - - ! calculate and exchange information - if (hgmode .eq. ichar('s')) then - if (mpirank .eq. mpirank_top) then - alpha = x(top) - else - alpha = 0.0d0 - end if - - dot = ddot(local_size, & - x(local_offset), incx, & - x(local_offset), incx) - - work(1) = alpha - work(2) = dot - - call mpi_allreduce(work(1),work(sendsize+1), & - sendsize,mpi_real8,mpi_sum, & - mpi_comm,mpierr) - - alpha = work(sendsize+1) - xnorm = sqrt(work(sendsize+2)) - else if (hgmode .eq. ichar('x')) then - if (mpirank .eq. 
mpirank_top) then - alpha = x(top) - else - alpha = 0.0d0 - end if - - xnorm = dnrm2(local_size, x(local_offset), incx) - - do iproc=0,mpiprocs-1 - work(2*iproc+1) = alpha - work(2*iproc+2) = xnorm - end do - - call mpi_alltoall(work(1),2,mpi_real8, & - work(sendsize+1),2,mpi_real8, & - mpi_comm,mpierr) - - ! extract alpha value - alpha = work(sendsize+1+mpirank_top*2) - - ! copy norm parts of buffer to beginning - do iproc=0,mpiprocs-1 - work(iproc+1) = work(sendsize+1+2*iproc+1) - end do - - xnorm = dnrm2(mpiprocs, work(1), 1) - else if (hgmode .eq. ichar('g')) then - if (mpirank .eq. mpirank_top) then - alpha = x(top) - else - alpha = 0.0d0 - end if - - xnorm = dnrm2(local_size, x(local_offset), incx) - work(1) = alpha - work(2) = xnorm - - ! allgather - call mpi_allgather(work(1),sendsize,mpi_real8, & - work(sendsize+1),sendsize,mpi_real8, & - mpi_comm,mpierr) - - ! extract alpha value - alpha = work(sendsize+1+mpirank_top*2) - - ! copy norm parts of buffer to beginning - do iproc=0,mpiprocs-1 - work(iproc+1) = work(sendsize+1+2*iproc+1) - end do - - xnorm = dnrm2(mpiprocs, work(1), 1) - else - ! dnrm2 - xnorm = dnrm2(local_size, x(local_offset), incx) - - if (mpirank .eq. mpirank_top) then - alpha = x(top) - else - alpha = 0.0d0 - end if - - ! no exchange at all (benchmarking) - - xnorm = 0.0d0 - end if - - !print *,'ref hg:', idx,xnorm,alpha - !print *,x(1:n) - - ! calculate householder information - if (xnorm .eq. 0.0d0) then - ! H = I - - tau = 0.0d0 - else - ! General case - - call hh_transform_real(alpha,xnorm**2,xf,tau) - if (mpirank .eq. mpirank_top) then - x(top) = alpha - end if - - call dscal(local_size, xf, & - x(local_offset), incx) - - ! TODO: reimplement norm rescale method of - ! original PDLARFG using mpi? - - end if - - ! 
useful for debugging - !print *,'hg:mpirank,idx,beta,alpha:',mpirank,idx,beta,alpha,1.0d0/(beta+alpha),tau - !print *,x(1:n) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg_1dcomm") -#endif - end subroutine qr_pdlarfg_1dcomm - - subroutine qr_pdlarfg2_1dcomm_ref(a,lda,tau,t,ldt,v,ldv,baseidx,work,lwork,m,idx,mb,PQRPARAM,rev,mpicomm,actualk) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2,eps_ = 3, upmode1_ = 4 - ! input variables (local) - integer(kind=ik) :: lda,lwork,ldv,ldt - real(kind=rk) :: a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*) - - ! input variables (global) - integer(kind=ik) :: m,idx,baseidx,mb,rev,mpicomm -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - integer(kind=ik) :: PQRPARAM(*) -#else - integer(kind=ik) :: PQRPARAM(:) -#endif - ! output variables (global) - integer(kind=ik) :: actualk - - ! derived input variables from QR_PQRPARAM - integer(kind=ik) :: eps - - ! local scalars - real(kind=rk) :: dseedwork_size(1) - integer(kind=ik) :: seedwork_size,seed_size - integer(kind=ik) :: seedwork_offset,seed_offset - logical :: accurate -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfg2_1dcomm") -#endif - - call qr_pdlarfg2_1dcomm_seed(a,lda,dseedwork_size(1),-1,work,m,mb,idx,rev,mpicomm) - seedwork_size = dseedwork_size(1) - seed_size = seedwork_size - - if (lwork .eq. -1) then - work(1) = seedwork_size + seed_size -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm") -#endif - - return - end if - - seedwork_offset = 1 - seed_offset = seedwork_offset + seedwork_size - - eps = PQRPARAM(3) - - ! check for border cases (only a 2x2 matrix left) - if (idx .le. 
1) then - tau(1:2) = 0.0d0 - t(1:2,1:2) = 0.0d0 -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm") -#endif - - return - end if - - call qr_pdlarfg2_1dcomm_seed(a,lda,work(seedwork_offset),lwork,work(seed_offset),m,mb,idx,rev,mpicomm) - - if (eps .gt. 0) then - accurate = qr_pdlarfg2_1dcomm_check(work(seed_offset),eps) - else - accurate = .true. - end if - - call qr_pdlarfg2_1dcomm_vector(a(1,2),1,tau(2),work(seed_offset), & - m,mb,idx,0,1,mpicomm) - - call qr_pdlarfg_copy_1dcomm(a(1,2),1, & - v(1,2),1, & - m,baseidx,idx,mb,1,mpicomm) - - call qr_pdlarfg2_1dcomm_update(v(1,2),1,baseidx,a(1,1),lda,work(seed_offset),m,idx,mb,rev,mpicomm) - - ! check for 2x2 matrix case => only one householder vector will be - ! generated - if (idx .gt. 2) then - if (accurate .eqv. .true.) then - call qr_pdlarfg2_1dcomm_vector(a(1,1),1,tau(1),work(seed_offset), & - m,mb,idx-1,1,1,mpicomm) - - call qr_pdlarfg_copy_1dcomm(a(1,1),1, & - v(1,1),1, & - m,baseidx,idx-1,mb,1,mpicomm) - - ! generate fuse element - call qr_pdlarfg2_1dcomm_finalize_tmatrix(work(seed_offset),tau,t,ldt) - - actualk = 2 - else - t(1,1) = 0.0d0 - t(1,2) = 0.0d0 - t(2,2) = tau(2) - - actualk = 1 - end if - else - t(1,1) = 0.0d0 - t(1,2) = 0.0d0 - t(2,2) = tau(2) - - ! no more vectors to create - - tau(1) = 0.0d0 - - actualk = 2 - - !print *,'rank2: no more data' - end if -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm") -#endif - - end subroutine qr_pdlarfg2_1dcomm_ref - - subroutine qr_pdlarfg2_1dcomm_seed(a,lda,work,lwork,seed,n,nb,idx,rev,mpicomm) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables (local) - integer(kind=ik) :: lda,lwork - real(kind=rk) :: a(lda,*),work(*),seed(*) - - ! input variables (global) - integer(kind=ik) :: n,nb,idx,rev,mpicomm - - ! output variables (global) - - ! external functions - real(kind=rk), external :: ddot - - ! 
local scalars - real(kind=rk) :: top11,top21,top12,top22 - real(kind=rk) :: dot11,dot12,dot22 - integer(kind=ik) :: mpirank,mpiprocs,mpierr - integer(kind=ik) :: mpirank_top11,mpirank_top21 - integer(kind=ik) :: top11_offset,top21_offset - integer(kind=ik) :: baseoffset - integer(kind=ik) :: local_offset1,local_size1 - integer(kind=ik) :: local_offset2,local_size2 - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfg2_1dcomm_seed") -#endif - - if (lwork .eq. -1) then - work(1) = DBLE(8) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm_seed") -#endif - return - end if - - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - - call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, & - local_size1,baseoffset,local_offset1) - - call local_size_offset_1d(n,nb,idx,idx-2,rev,mpirank,mpiprocs, & - local_size2,baseoffset,local_offset2) - - mpirank_top11 = MOD((idx-1)/nb,mpiprocs) - mpirank_top21 = MOD((idx-2)/nb,mpiprocs) - - top11_offset = local_index(idx,mpirank_top11,mpiprocs,nb,0) - top21_offset = local_index(idx-1,mpirank_top21,mpiprocs,nb,0) - - if (mpirank_top11 .eq. mpirank) then - top11 = a(top11_offset,2) - top12 = a(top11_offset,1) - else - top11 = 0.0d0 - top12 = 0.0d0 - end if - - if (mpirank_top21 .eq. mpirank) then - top21 = a(top21_offset,2) - top22 = a(top21_offset,1) - else - top21 = 0.0d0 - top22 = 0.0d0 - end if - - ! calculate 3 dot products - dot11 = ddot(local_size1,a(local_offset1,2),1,a(local_offset1,2),1) - dot12 = ddot(local_size1,a(local_offset1,2),1,a(local_offset1,1),1) - dot22 = ddot(local_size2,a(local_offset2,1),1,a(local_offset2,1),1) - - ! store results in work buffer - work(1) = top11 - work(2) = dot11 - work(3) = top12 - work(4) = dot12 - work(5) = top21 - work(6) = top22 - work(7) = dot22 - work(8) = 0.0d0 ! fill up buffer - - ! 
exchange partial results - call mpi_allreduce(work, seed, 8, mpi_real8, mpi_sum, & - mpicomm, mpierr) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm_seed") -#endif - end subroutine qr_pdlarfg2_1dcomm_seed - - logical function qr_pdlarfg2_1dcomm_check(seed,eps) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables - real(kind=rk) :: seed(*) - integer(kind=ik) :: eps - - ! local scalars - real(kind=rk) :: epsd,first,second,first_second,estimate - logical :: accurate - real(kind=rk) :: dot11,dot12,dot22 - real(kind=rk) :: top11,top12,top21,top22 -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfg2_1dcomm_check") -#endif - - EPSD = EPS - - top11 = seed(1) - dot11 = seed(2) - top12 = seed(3) - dot12 = seed(4) - - top21 = seed(5) - top22 = seed(6) - dot22 = seed(7) - - ! reconstruct the whole inner products - ! (including squares of the top elements) - first = dot11 + top11*top11 - second = dot22 + top22*top22 + top12*top12 - first_second = dot12 + top11*top12 - - ! zero Householder vector (zero norm) case - if (first*second .eq. 0.0d0) then - qr_pdlarfg2_1dcomm_check = .false. -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm_check") -#endif - - return - end if - - estimate = abs((first_second*first_second)/(first*second)) - - !print *,'estimate:',estimate - - ! if accurate the following check holds - accurate = (estimate .LE. (epsd/(1.0d0+epsd))) - - qr_pdlarfg2_1dcomm_check = accurate -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm_check") -#endif - - end function qr_pdlarfg2_1dcomm_check - - ! id=0: first vector - ! id=1: second vector - subroutine qr_pdlarfg2_1dcomm_vector(x,incx,tau,seed,n,nb,idx,id,rev,mpicomm) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables (local) - integer(kind=ik) :: incx - real(kind=rk) :: x(*),seed(*),tau - - ! 
input variables (global) - integer(kind=ik) :: n,nb,idx,id,rev,mpicomm - - ! output variables (global) - - ! external functions - real(kind=rk), external :: dlapy2 - external :: dscal - - ! local scalars - integer(kind=ik) :: mpirank,mpirank_top,mpiprocs,mpierr - real(kind=rk) :: alpha,dot,beta,xnorm - integer(kind=ik) :: local_size,baseoffset,local_offset,top,topidx -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfg2_1dcomm_vector") -#endif - - - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - - call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, & - local_size,baseoffset,local_offset) - - local_offset = local_offset * incx - - ! Processor id for global index of top element - mpirank_top = MOD((idx-1)/nb,mpiprocs) - if (mpirank .eq. mpirank_top) then - topidx = local_index(idx,mpirank_top,mpiprocs,nb,0) - top = 1+(topidx-1)*incx - end if - - alpha = seed(id*5+1) - dot = seed(id*5+2) - - xnorm = sqrt(dot) - - if (xnorm .eq. 0.0d0) then - ! H = I - - tau = 0.0d0 - else - ! General case - - beta = sign(dlapy2(alpha, xnorm), alpha) - tau = (beta+alpha) / beta - - !print *,'hg2',tau,xnorm,alpha - - call dscal(local_size, 1.0d0/(beta+alpha), & - x(local_offset), incx) - - ! TODO: reimplement norm rescale method of - ! original PDLARFG using mpi? - - if (mpirank .eq. mpirank_top) then - x(top) = -beta - end if - - seed(8) = beta - end if -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm_vector") -#endif - - end subroutine qr_pdlarfg2_1dcomm_vector - - subroutine qr_pdlarfg2_1dcomm_update(v,incv,baseidx,a,lda,seed,n,idx,nb,rev,mpicomm) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables (local) - integer(kind=ik) :: incv,lda - real(kind=rk) :: v(*),a(lda,*),seed(*) - - ! input variables (global) - integer(kind=ik) :: n,baseidx,idx,nb,rev,mpicomm - - ! output variables (global) - - ! 
external functions - external daxpy - - ! local scalars - integer(kind=ik) :: mpirank,mpiprocs,mpierr - integer(kind=ik) :: local_size,local_offset,baseoffset - real(kind=rk) :: z,coeff,beta - real(kind=rk) :: dot11,dot12,dot22 - real(kind=rk) :: top11,top12,top21,top22 -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfg2_1dcomm_update") -#endif - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - - - ! seed should be updated by previous householder generation - ! Update inner product of this column and next column vector - top11 = seed(1) - dot11 = seed(2) - top12 = seed(3) - dot12 = seed(4) - - top21 = seed(5) - top22 = seed(6) - dot22 = seed(7) - beta = seed(8) - - call local_size_offset_1d(n,nb,baseidx,idx,rev,mpirank,mpiprocs, & - local_size,baseoffset,local_offset) - baseoffset = baseoffset * incv - - ! zero Householder vector (zero norm) case - if (beta .eq. 0.0d0) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm_update") -#endif - return - end if - z = (dot12 + top11 * top12) / beta + top12 - - !print *,'hg2 update:',baseidx,idx,mpirank,local_size - - call daxpy(local_size, -z, v(baseoffset),1, a(local_offset,1),1) - - ! prepare a full dot22 for update - dot22 = dot22 + top22*top22 - - ! calculate coefficient - COEFF = z / (top11 + beta) - - ! update inner product of next vector - dot22 = dot22 - coeff * (2*dot12 - coeff*dot11) - - ! update dot12 value to represent update with first vector - ! (needed for T matrix) - dot12 = dot12 - COEFF * dot11 - - ! update top element of next vector - top22 = top22 - coeff * top21 - seed(6) = top22 - - ! restore separated dot22 for vector generation - seed(7) = dot22 - top22*top22 - - !------------------------------------------------------ - ! prepare elements for T matrix - seed(4) = dot12 - - ! prepare dot matrix for fuse element of T matrix - ! 
replace top11 value with -beta1 - seed(1) = beta -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm_update") -#endif - - end subroutine qr_pdlarfg2_1dcomm_update - - ! run this function after second vector - subroutine qr_pdlarfg2_1dcomm_finalize_tmatrix(seed,tau,t,ldt) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik) :: ldt - real(kind=rk) :: seed(*),t(ldt,*),tau(*) - real(kind=rk) :: dot12,beta1,top21,beta2 -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfg2_1dcomm_finalize_tmatrix") -#endif - - beta1 = seed(1) - dot12 = seed(4) - top21 = seed(5) - beta2 = seed(8) - - !print *,'beta1 beta2',beta1,beta2 - - dot12 = dot12 / beta2 + top21 - dot12 = -(dot12 / beta1) - - t(1,1) = tau(1) - t(1,2) = dot12 - t(2,2) = tau(2) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm_finalize_tmatrix") -#endif - - end subroutine qr_pdlarfg2_1dcomm_finalize_tmatrix - - subroutine qr_pdlarfgk_1dcomm(a,lda,tau,t,ldt,v,ldv,baseidx,work,lwork,m,k,idx,mb,PQRPARAM,rev,mpicomm,actualk) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - - ! input variables (local) - integer(kind=ik) :: lda,lwork,ldv,ldt - real(kind=rk) :: a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*) - - ! input variables (global) - integer(kind=ik) :: m,k,idx,baseidx,mb,rev,mpicomm -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - integer(kind=ik) ::PQRPARAM(*) -#else - integer(kind=ik) :: PQRPARAM(:) -#endif - ! output variables (global) - integer(kind=ik) :: actualk - - ! 
local scalars - integer(kind=ik) :: ivector - real(kind=rk) :: pdlarfg_size(1),pdlarf_size(1) - real(kind=rk) :: pdlarfgk_1dcomm_seed_size(1),pdlarfgk_1dcomm_check_size(1) - real(kind=rk) :: pdlarfgk_1dcomm_update_size(1) - integer(kind=ik) :: seedC_size,seedC_offset - integer(kind=ik) :: seedD_size,seedD_offset - integer(kind=ik) :: work_offset -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfgk_1dcomm") -#endif - - seedC_size = k*k - seedC_offset = 1 - seedD_size = k*k - seedD_offset = seedC_offset + seedC_size - work_offset = seedD_offset + seedD_size - - if (lwork .eq. -1) then - call qr_pdlarfg_1dcomm(a,1,tau(1),pdlarfg_size(1),-1,m,baseidx,mb,PQRPARAM(4),rev,mpicomm) - - call qr_pdlarfl_1dcomm(v,1,baseidx,a,lda,tau(1),pdlarf_size(1),-1,m,k,baseidx,mb,rev,mpicomm) - call qr_pdlarfgk_1dcomm_seed(a,lda,baseidx,pdlarfgk_1dcomm_seed_size(1),-1,work,work,m,k,mb,mpicomm) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - !call qr_pdlarfgk_1dcomm_check(work,work,k,PQRPARAM,pdlarfgk_1dcomm_check_size(1),-1,actualk) - call qr_pdlarfgk_1dcomm_check_improved(work,work,k,PQRPARAM,pdlarfgk_1dcomm_check_size(1),-1,actualk) - -#else - !call qr_pdlarfgk_1dcomm_check(work,work,k,PQRPARAM(:),pdlarfgk_1dcomm_check_size(1),-1,actualk) - call qr_pdlarfgk_1dcomm_check_improved(work,work,k,PQRPARAM(:),pdlarfgk_1dcomm_check_size(1),-1,actualk) -#endif - call qr_pdlarfgk_1dcomm_update(a,lda,baseidx,pdlarfgk_1dcomm_update_size(1),-1,work,work,k,k,1,work,m,mb,rev,mpicomm) - work(1) = max(pdlarfg_size(1),pdlarf_size(1),pdlarfgk_1dcomm_seed_size(1),pdlarfgk_1dcomm_check_size(1), & - pdlarfgk_1dcomm_update_size(1)) + real(seedC_size + seedD_size, kind=rk) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm") -#endif - - return - end if - - call qr_pdlarfgk_1dcomm_seed(a(1,1),lda,idx,work(work_offset),lwork,work(seedC_offset),work(seedD_offset),m,k,mb,mpicomm) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - !call 
qr_pdlarfgk_1dcomm_check(work(seedC_offset),work(seedD_offset),k,PQRPARAM,work(work_offset),lwork,actualk) - call qr_pdlarfgk_1dcomm_check_improved(work(seedC_offset),work(seedD_offset),k,PQRPARAM,work(work_offset),lwork,actualk) -#else - !call qr_pdlarfgk_1dcomm_check(work(seedC_offset),work(seedD_offset),k,PQRPARAM(:),work(work_offset),lwork,actualk) - call qr_pdlarfgk_1dcomm_check_improved(work(seedC_offset),work(seedD_offset),k,PQRPARAM(:),work(work_offset),lwork,actualk) -#endif - !print *,'possible rank:', actualk - - ! override useful for debugging - !actualk = 1 - !actualk = k - !actualk= min(actualk,2) - do ivector=1,actualk - call qr_pdlarfgk_1dcomm_vector(a(1,k-ivector+1),1,idx,tau(k-ivector+1), & - work(seedC_offset),work(seedD_offset),k, & - ivector,m,mb,rev,mpicomm) - - call qr_pdlarfgk_1dcomm_update(a(1,1),lda,idx,work(work_offset),lwork,work(seedC_offset), & - work(seedD_offset),k,actualk,ivector,tau, & - m,mb,rev,mpicomm) - - call qr_pdlarfg_copy_1dcomm(a(1,k-ivector+1),1, & - v(1,k-ivector+1),1, & - m,baseidx,idx-ivector+1,mb,1,mpicomm) - end do - - ! generate final T matrix and convert preliminary tau values into real ones - call qr_pdlarfgk_1dcomm_generateT(work(seedC_offset),work(seedD_offset),k,actualk,tau,t,ldt) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm") -#endif - end subroutine qr_pdlarfgk_1dcomm - - subroutine qr_pdlarfgk_1dcomm_seed(a,lda,baseidx,work,lwork,seedC,seedD,m,k,mb,mpicomm) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - - ! input variables (local) - integer(kind=ik) :: lda,lwork - real(kind=rk) :: a(lda,*), work(*) - - ! input variables (global) - integer(kind=ik) :: m,k,baseidx,mb,mpicomm - real(kind=rk) :: seedC(k,*),seedD(k,*) - - ! output variables (global) - - ! derived input variables from QR_PQRPARAM - - ! 
local scalars - integer(kind=ik) :: mpierr,mpirank,mpiprocs,mpirank_top - integer(kind=ik) :: icol,irow,lidx,remsize - integer(kind=ik) :: remaining_rank - - integer(kind=ik) :: C_size,D_size,sendoffset,recvoffset,sendrecv_size - integer(kind=ik) :: localoffset,localsize,baseoffset -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfgk_1dcomm_seed") -#endif - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - - C_size = k*k - D_size = k*k - sendoffset = 1 - sendrecv_size = C_size+D_size - recvoffset = sendoffset + sendrecv_size - - if (lwork .eq. -1) then - work(1) = DBLE(2*sendrecv_size) - return - end if - - ! clear buffer - work(sendoffset:sendoffset+sendrecv_size-1)=0.0d0 - - ! collect C part - do icol=1,k - - remaining_rank = k - do while (remaining_rank .gt. 0) - irow = k - remaining_rank + 1 - lidx = baseidx - remaining_rank + 1 - - ! determine chunk where the current top element is located - mpirank_top = MOD((lidx-1)/mb,mpiprocs) - - ! limit max number of remaining elements of this chunk to the block - ! distribution parameter - remsize = min(remaining_rank,mb) - - ! determine the number of needed elements in this chunk - call local_size_offset_1d(lidx+remsize-1,mb, & - lidx,lidx,0, & - mpirank_top,mpiprocs, & - localsize,baseoffset,localoffset) - - !print *,'local rank',localsize,localoffset - - if (mpirank .eq. mpirank_top) then - ! copy elements to buffer - work(sendoffset+(icol-1)*k+irow-1:sendoffset+(icol-1)*k+irow-1+localsize-1) & - = a(localoffset:localoffset+remsize-1,icol) - end if - - ! jump to next chunk - remaining_rank = remaining_rank - localsize - end do - end do - - ! 
collect D part - call local_size_offset_1d(m,mb,baseidx-k,baseidx-k,1, & - mpirank,mpiprocs, & - localsize,baseoffset,localoffset) - - !print *,'localsize',localsize,localoffset - if (localsize > 0) then - call dsyrk("Upper", "Trans", k, localsize, & - 1.0d0, a(localoffset,1), lda, & - 0.0d0, work(sendoffset+C_size), k) - else - work(sendoffset+C_size:sendoffset+C_size+k*k-1) = 0.0d0 - end if - - ! TODO: store symmetric part more efficiently - - ! allreduce operation on results - call mpi_allreduce(work(sendoffset),work(recvoffset),sendrecv_size, & - mpi_real8,mpi_sum,mpicomm,mpierr) - - ! unpack result from buffer into seedC and seedD - seedC(1:k,1:k) = 0.0d0 - do icol=1,k - seedC(1:k,icol) = work(recvoffset+(icol-1)*k:recvoffset+icol*k-1) - end do - - seedD(1:k,1:k) = 0.0d0 - do icol=1,k - seedD(1:k,icol) = work(recvoffset+C_size+(icol-1)*k:recvoffset+C_size+icol*k-1) - end do - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_seed") -#endif - - end subroutine qr_pdlarfgk_1dcomm_seed - - ! k is assumed to be larger than two - subroutine qr_pdlarfgk_1dcomm_check_improved(seedC,seedD,k,PQRPARAM,work,lwork,possiblerank) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables (global) - integer(kind=ik) :: k,lwork -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - integer(kind=ik) :: PQRPARAM(*) - -#else - integer(kind=ik) :: PQRPARAM(:) -#endif - real(kind=rk) :: seedC(k,*),seedD(k,*),work(k,*) - - ! output variables (global) - integer(kind=ik) :: possiblerank - - ! derived input variables from QR_PQRPARAM - integer(kind=ik) :: eps - - ! local variables - integer(kind=ik) :: i,j,l - real(kind=rk) :: sum_squares,diagonal_square,relative_error,epsd,diagonal_root - real(kind=rk) :: dreverse_matrix_work(1) - - ! external functions - real(kind=rk), external :: ddot,dlapy2,dnrm2 - external :: dscal - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfgk_1dcomm_check_improved") -#endif - - if (lwork .eq. 
-1) then - call reverse_matrix_local(1,k,k,work,k,dreverse_matrix_work,-1) - work(1,1) = DBLE(k*k) + dreverse_matrix_work(1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check_improved") -#endif - return - end if - - eps = PQRPARAM(3) - - if (eps .eq. 0) then - possiblerank = k -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check_improved") -#endif - return - end if - - epsd = DBLE(eps) - - ! build complete inner product from seedC and seedD - ! copy seedD to work - work(:,1:k) = seedD(:,1:k) - - ! add inner products of seedC to work - call dsyrk("Upper", "Trans", k, k, & - 1.0d0, seedC(1,1), k, & - 1.0d0, work, k) - - ! TODO: optimize this part! - call reverse_matrix_local(0,k,k,work(1,1),k,work(1,k+1),lwork-2*k) - call reverse_matrix_local(1,k,k,work(1,1),k,work(1,k+1),lwork-2*k) - - ! transpose matrix - do i=1,k - do j=i+1,k - work(i,j) = work(j,i) - end do - end do - - - ! do cholesky decomposition - i = 0 - do while ((i .lt. k)) - i = i + 1 - - diagonal_square = abs(work(i,i)) - diagonal_root = sqrt(diagonal_square) - - ! zero Householder vector (zero norm) case - if ((abs(diagonal_square) .eq. 0.0d0) .or. (abs(diagonal_root) .eq. 0.0d0)) then - possiblerank = max(i-1,1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check_improved") -#endif - return - end if - - ! check if relative error is bounded for each Householder vector - ! Householder i is stable iff Househoulder i-1 is "stable" and the accuracy criterion - ! holds. - ! first Householder vector is considered as "stable". - - do j=i+1,k - work(i,j) = work(i,j) / diagonal_root - do l=i+1,j - work(l,j) = work(l,j) - work(i,j) * work(i,l) - end do - end do - !print *,'cholesky step done' - - ! build sum of squares - if (i .eq. 
1) then - sum_squares = 0.0d0 - else - sum_squares = ddot(i-1,work(1,i),1,work(1,i),1) - end if - !relative_error = sum_squares / diagonal_square - !print *,'error ',i,sum_squares,diagonal_square,relative_error - - if (sum_squares .ge. (epsd * diagonal_square)) then - possiblerank = max(i-1,1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check_improved") -#endif - return - end if - end do - - possiblerank = i - !print *,'possible rank', possiblerank -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check_improved") -#endif - - end subroutine qr_pdlarfgk_1dcomm_check_improved - - ! TODO: zero Householder vector (zero norm) case - ! - check alpha values as well (from seedC) - subroutine qr_pdlarfgk_1dcomm_check(seedC,seedD,k,PQRPARAM,work,lwork,possiblerank) - use precision - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - - ! input variables (local) - - ! input variables (global) - integer(kind=ik) :: k,lwork -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - integer(kind=ik) :: PQRPARAM(*) -#else - integer(kind=ik) :: PQRPARAM(:) -#endif - real(kind=rk) :: seedC(k,*),seedD(k,*),work(k,*) - - ! output variables (global) - integer(kind=ik) :: possiblerank - - ! derived input variables from QR_PQRPARAM - integer(kind=ik) :: eps - - ! local scalars - integer(kind=ik) :: icol,isqr,iprod - real(kind=rk) :: epsd,sum_sqr,sum_products,diff,temp,ortho,ortho_sum - real(kind=rk) :: dreverse_matrix_work(1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfgk_1dcomm_check") -#endif - if (lwork .eq. -1) then - call reverse_matrix_local(1,k,k,work,k,dreverse_matrix_work,-1) - work(1,1) = DBLE(k*k) + dreverse_matrix_work(1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check") -#endif - - return - end if - - eps = PQRPARAM(3) - - if (eps .eq. 
0) then - possiblerank = k -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check") -#endif - return - end if - - epsd = DBLE(eps) - - - ! copy seedD to work - work(:,1:k) = seedD(:,1:k) - - ! add inner products of seedC to work - call dsyrk("Upper", "Trans", k, k, & - 1.0d0, seedC(1,1), k, & - 1.0d0, work, k) - - ! TODO: optimize this part! - call reverse_matrix_local(0,k,k,work(1,1),k,work(1,k+1),lwork-2*k) - call reverse_matrix_local(1,k,k,work(1,1),k,work(1,k+1),lwork-2*k) - - ! transpose matrix - do icol=1,k - do isqr=icol+1,k - work(icol,isqr) = work(isqr,icol) - end do - end do - - ! work contains now the full inner product of the global (sub-)matrix - do icol=1,k - ! zero Householder vector (zero norm) case - if (abs(work(icol,icol)) .eq. 0.0d0) then - !print *,'too small ', icol, work(icol,icol) - possiblerank = max(icol,1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check") -#endif - return - end if - - sum_sqr = 0.0d0 - do isqr=1,icol-1 - sum_products = 0.0d0 - do iprod=1,isqr-1 - sum_products = sum_products + work(iprod,isqr)*work(iprod,icol) - end do - - !print *,'divisor',icol,isqr,work(isqr,isqr) - temp = (work(isqr,icol) - sum_products)/work(isqr,isqr) - work(isqr,icol) = temp - sum_sqr = sum_sqr + temp*temp - end do - - ! calculate diagonal value - diff = work(icol,icol) - sum_sqr - if (diff .lt. 0.0d0) then - ! we definitely have a problem now - possiblerank = icol-1 ! only decompose to previous column (including) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check") -#endif - return - end if - work(icol,icol) = sqrt(diff) - ! calculate orthogonality - ortho = 0.0d0 - do isqr=1,icol-1 - ortho_sum = 0.0d0 - do iprod=isqr,icol-1 - temp = work(isqr,iprod)*work(isqr,iprod) - !print *,'ortho ', work(iprod,iprod) - temp = temp / (work(iprod,iprod)*work(iprod,iprod)) - ortho_sum = ortho_sum + temp - end do - ortho = ortho + ortho_sum * (work(isqr,icol)*work(isqr,icol)) - end do - - ! 
---------------- with division by zero ----------------------- ! - - !ortho = ortho / diff; - - ! if current estimate is not accurate enough, the following check holds - !if (ortho .gt. epsd) then - ! possiblerank = icol-1 ! only decompose to previous column (including) - ! return - !end if - - ! ---------------- without division by zero ----------------------- ! - - ! if current estimate is not accurate enough, the following check holds - if (ortho .gt. epsd * diff) then - possiblerank = icol-1 ! only decompose to previous column (including) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check") -#endif - return - end if - end do - - ! if we get to this point, the accuracy condition holds for the whole block - possiblerank = k -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check") -#endif - end subroutine qr_pdlarfgk_1dcomm_check - - !sidx: seed idx - !k: max rank used during seed phase - !rank: actual rank (k >= rank) - subroutine qr_pdlarfgk_1dcomm_vector(x,incx,baseidx,tau,seedC,seedD,k,sidx,n,nb,rev,mpicomm) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables (local) - integer(kind=ik) :: incx - real(kind=rk) :: x(*),tau - - ! input variables (global) - integer(kind=ik) :: n,nb,baseidx,rev,mpicomm,k,sidx - real(kind=rk) :: seedC(k,*),seedD(k,*) - - ! output variables (global) - - ! external functions - real(kind=rk), external :: dlapy2,dnrm2 - external :: dscal - - ! 
local scalars - integer(kind=ik) :: mpirank,mpirank_top,mpiprocs,mpierr - real(kind=rk) :: alpha,dot,beta,xnorm - integer(kind=ik) :: local_size,baseoffset,local_offset,top,topidx - integer(kind=ik) :: lidx -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfgk_1dcomm_vector") -#endif - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - - lidx = baseidx-sidx+1 - call local_size_offset_1d(n,nb,baseidx,lidx-1,rev,mpirank,mpiprocs, & - local_size,baseoffset,local_offset) - - local_offset = local_offset * incx - - ! Processor id for global index of top element - mpirank_top = MOD((lidx-1)/nb,mpiprocs) - if (mpirank .eq. mpirank_top) then - topidx = local_index((lidx),mpirank_top,mpiprocs,nb,0) - top = 1+(topidx-1)*incx - end if - - alpha = seedC(k-sidx+1,k-sidx+1) - dot = seedD(k-sidx+1,k-sidx+1) - ! assemble actual norm from both seed parts - xnorm = dlapy2(sqrt(dot), dnrm2(k-sidx,seedC(1,k-sidx+1),1)) - - if (xnorm .eq. 0.0d0) then - tau = 0.0d0 - else - ! General case - - beta = sign(dlapy2(alpha, xnorm), alpha) - ! store a preliminary version of beta in tau - tau = beta - - ! update global part - call dscal(local_size, 1.0d0/(beta+alpha), & - x(local_offset), incx) - - ! do not update local part here due to - ! dependency of c vector during update process - - ! TODO: reimplement norm rescale method of - ! original PDLARFG using mpi? - - if (mpirank .eq. mpirank_top) then - x(top) = -beta - end if - end if -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_vector") -#endif - - end subroutine qr_pdlarfgk_1dcomm_vector - - !k: original max rank used during seed function - !rank: possible rank as from check function - ! TODO: if rank is less than k, reduce buffersize in such a way - ! that only the required entries for the next pdlarfg steps are - ! 
computed - subroutine qr_pdlarfgk_1dcomm_update(a,lda,baseidx,work,lwork,seedC,seedD,k,rank,sidx,tau,n,nb,rev,mpicomm) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2,eps_ = 3, upmode1_ = 4 - - ! input variables (local) - integer(kind=ik) :: lda,lwork - real(kind=rk) :: a(lda,*),work(*) - - ! input variables (global) - integer(kind=ik) :: k,rank,sidx,n,baseidx,nb,rev,mpicomm - real(kind=rk) :: beta - - ! output variables (global) - real(kind=rk) :: seedC(k,*),seedD(k,*),tau(*) - - ! derived input variables from QR_PQRPARAM - - ! local scalars - real(kind=rk) :: alpha - integer(kind=ik) :: coffset,zoffset,yoffset,voffset,buffersize - integer(kind=ik) :: mpirank,mpierr,mpiprocs,mpirank_top - integer(kind=ik) :: localsize,baseoffset,localoffset,topidx - integer(kind=ik) :: lidx -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfgk_1dcomm_update") -#endif - if (lwork .eq. -1) then - ! buffer for c,z,y,v - work(1) = 4*k -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_update") -#endif - - return - end if - - ! nothing to update anymore - if (sidx .gt. rank) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_update") -#endif - return - endif - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - - lidx = baseidx-sidx - if (lidx .lt. 1) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_update") -#endif - return - endif - - call local_size_offset_1d(n,nb,baseidx,lidx,rev,mpirank,mpiprocs, & - localsize,baseoffset,localoffset) - - coffset = 1 - zoffset = coffset + k - yoffset = zoffset + k - voffset = yoffset + k - buffersize = k - sidx - - ! finalize tau values - alpha = seedC(k-sidx+1,k-sidx+1) - beta = tau(k-sidx+1) - - ! 
zero Householder vector (zero norm) case - !print *,'k update: alpha,beta',alpha,beta - if ((beta .eq. 0.0d0) .or. (alpha .eq. 0.0d0)) then - tau(k-sidx+1) = 0.0d0 - seedC(k,k-sidx+1) = 0.0d0 -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_update") -#endif - return - end if - - tau(k-sidx+1) = (beta+alpha) / beta - - ! --------------------------------------- - ! calculate c vector (extra vector or encode in seedC/seedD? - work(coffset:coffset+buffersize-1) = seedD(1:buffersize,k-sidx+1) - call dgemv("Trans", buffersize+1, buffersize, & - 1.0d0,seedC(1,1),k,seedC(1,k-sidx+1),1, & - 1.0d0,work(coffset),1) - - ! calculate z using tau,seedD,seedC and c vector - work(zoffset:zoffset+buffersize-1) = seedC(k-sidx+1,1:buffersize) - call daxpy(buffersize, 1.0d0/beta, work(coffset), 1, work(zoffset), 1) - - ! update A1(local copy) and generate part of householder vectors for use - call daxpy(buffersize, -1.0d0, work(zoffset),1,seedC(k-sidx+1,1),k) - call dscal(buffersize, 1.0d0/(alpha+beta), seedC(1,k-sidx+1),1) - call dger(buffersize, buffersize, -1.0d0, seedC(1,k-sidx+1),1, work(zoffset), 1, seedC(1,1), k) - - ! update A global (householder vector already generated by pdlarfgk) - mpirank_top = MOD(lidx/nb,mpiprocs) - if (mpirank .eq. mpirank_top) then - ! handle first row separately - topidx = local_index(lidx+1,mpirank_top,mpiprocs,nb,0) - call daxpy(buffersize,-1.0d0,work(zoffset),1,a(topidx,1),lda) - end if - - call dger(localsize, buffersize,-1.0d0, & - a(localoffset,k-sidx+1),1,work(zoffset),1, & - a(localoffset,1),lda) - - ! update D (symmetric) => two buffer vectors of size rank - ! generate y vector - work(yoffset:yoffset+buffersize-1) = 0.d0 - call daxpy(buffersize,1.0d0/(alpha+beta),work(zoffset),1,work(yoffset),1) - - ! generate v vector - work(voffset:voffset+buffersize-1) = seedD(1:buffersize,k-sidx+1) - call daxpy(buffersize, -0.5d0*seedD(k-sidx+1,k-sidx+1), work(yoffset), 1, work(voffset),1) - - ! 
symmetric update of D using y and v - call dsyr2("Upper", buffersize,-1.0d0, & - work(yoffset),1,work(voffset),1, & - seedD(1,1), k) - - ! prepare T matrix inner products - ! D_k(1:k,k+1:n) = D_(k-1)(1:k,k+1:n) - D_(k-1)(1:k,k) * y' - ! store coefficient 1.0d0/(alpha+beta) in C diagonal elements - call dger(k-sidx,sidx,-1.0d0,work(yoffset),1,seedD(k-sidx+1,k-sidx+1),k,seedD(1,k-sidx+1),k) - seedC(k,k-sidx+1) = 1.0d0/(alpha+beta) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_update") -#endif - end subroutine qr_pdlarfgk_1dcomm_update - - subroutine qr_pdlarfgk_1dcomm_generateT(seedC,seedD,k,actualk,tau,t,ldt) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik) :: k,actualk,ldt - real(kind=rk) :: seedC(k,*),seedD(k,*),tau(*),t(ldt,*) - - integer(kind=ik) :: irow,icol - real(kind=rk) :: column_coefficient -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfgk_1dcomm_generateT") -#endif - - !print *,'reversed on the fly T generation NYI' - - do icol=1,actualk-1 - ! calculate inner product of householder vector parts in seedC - ! (actually calculating more than necessary, if actualk < k) - ! => a lot of junk from row 1 to row k-actualk - call dtrmv('Upper','Trans','Unit',k-icol,seedC(1,1),k,seedC(1,k-icol+1),1) - - ! 
add scaled D parts to current column of C (will become later T rows) - column_coefficient = seedC(k,k-icol+1) - do irow=k-actualk+1,k-1 - seedC(irow,k-icol+1) = ( seedC(irow,k-icol+1) ) + ( seedD(irow,k-icol+1) * column_coefficient * seedC(k,irow) ) - end do - end do - - call qr_dlarft_kernel(actualk,tau(k-actualk+1),seedC(k-actualk+1,k-actualk+2),k,t(k-actualk+1,k-actualk+1),ldt) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_generateT") -#endif - - end subroutine qr_pdlarfgk_1dcomm_generateT - - !direction=0: pack into work buffer - !direction=1: unpack from work buffer - subroutine qr_pdgeqrf_pack_unpack(v,ldv,work,lwork,m,n,mb,baseidx,rowidx,rev,direction,mpicomm) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables (local) - integer(kind=ik) :: ldv,lwork - real(kind=rk) :: v(ldv,*), work(*) - - ! input variables (global) - integer(kind=ik) :: m,n,mb,baseidx,rowidx,rev,direction,mpicomm - - ! output variables (global) - - ! local scalars - integer(kind=ik) :: mpierr,mpirank,mpiprocs - integer(kind=ik) :: buffersize,icol - integer(kind=ik) :: local_size,baseoffset,offset - - ! external functions -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdgeqrf_pack_unpack") -#endif - call mpi_comm_rank(mpicomm,mpirank,mpierr) - call mpi_comm_size(mpicomm,mpiprocs,mpierr) - - call local_size_offset_1d(m,mb,baseidx,rowidx,rev,mpirank,mpiprocs, & - local_size,baseoffset,offset) - - !print *,'pack/unpack',local_size,baseoffset,offset - - ! rough approximate for buffer size - if (lwork .eq. -1) then - buffersize = local_size * n ! vector elements - work(1) = DBLE(buffersize) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqrf_pack_unpack") -#endif - - return - end if - - if (direction .eq. 0) then - ! 
copy v part to buffer (including zeros) - do icol=1,n - work(1+local_size*(icol-1):local_size*icol) = v(baseoffset:baseoffset+local_size-1,icol) - end do - else - ! copy v part from buffer (including zeros) - do icol=1,n - v(baseoffset:baseoffset+local_size-1,icol) = work(1+local_size*(icol-1):local_size*icol) - end do - end if -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqrf_pack_unpack") -#endif - - return - - end subroutine qr_pdgeqrf_pack_unpack - - !direction=0: pack into work buffer - !direction=1: unpack from work buffer - subroutine qr_pdgeqrf_pack_unpack_tmatrix(tau,t,ldt,work,lwork,n,direction) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables (local) - integer(kind=ik) :: ldt,lwork - real(kind=rk) :: work(*), t(ldt,*),tau(*) - - ! input variables (global) - integer(kind=ik) :: n,direction - - ! output variables (global) - - ! local scalars - integer(kind=ik) :: icol - - ! external functions -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdgeqrf_pack_unpack_tmatrix") -#endif - - - if (lwork .eq. -1) then - work(1) = DBLE(n*n) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqrf_pack_unpack_tmatrix") -#endif - - return - end if - - if (direction .eq. 0) then - ! append t matrix to buffer (including zeros) - do icol=1,n - work(1+(icol-1)*n:icol*n) = t(1:n,icol) - end do - else - ! append t matrix from buffer (including zeros) - do icol=1,n - t(1:n,icol) = work(1+(icol-1)*n:icol*n) - tau(icol) = t(icol,icol) - end do - end if -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqrf_pack_unpack_tmatrix") -#endif - end subroutine qr_pdgeqrf_pack_unpack_tmatrix - - - ! TODO: encode following functionality - ! - Direction? BOTTOM UP or TOP DOWN ("Up", "Down") - ! => influences all related kernels (including DLARFT / DLARFB) - ! - rank-k parameter (k=1,2,...,b) - ! => influences possible update strategies - ! => parameterize the function itself? 
(FUNCPTR, FUNCARG) - ! - Norm mode? Allreduce, Allgather, AlltoAll, "AllHouse", (ALLNULL = benchmarking local kernels) - ! - subblocking - ! (maximum block size bounded by data distribution along rows) - ! - blocking method (householder vectors only or compact WY?) - ! - update strategy of trailing parts (incremental, complete) - ! - difference for subblocks and normal blocks? (UPDATE and UPDATESUB) - ! o "Incremental" - ! o "Full" - ! - final T generation (recursive: subblock wise, block wise, end) (TMERGE) - ! ' (implicitly given by / influences update strategies?) - ! => alternative: during update: iterate over sub t parts - ! => advantage: smaller (cache aware T parts) - ! => disadvantage: more memory write backs - ! (number of T parts * matrix elements) - ! - partial/sub T generation (TGEN) - ! o add vectors right after creation (Vector) - ! o add set of vectors (Set) - ! - bcast strategy of householder vectors to other process columns - ! (influences T matrix generation and trailing update - ! in other process columns) - ! o no broadcast (NONE = benchmarking?, - ! or not needed due to 1D process grid) - ! o after every housegen (VECTOR) - ! o after every subblk (SUBBLOCK) - ! o after full local column block decomposition (BLOCK) - ! LOOP Housegen -> BCAST -> GENT/EXTENDT -> LOOP HouseLeft - - !subroutine qr_pqrparam_init(PQRPARAM, DIRECTION, RANK, NORMMODE, & - ! SUBBLK, UPDATE, TGEN, BCAST) - ! gmode: control communication pattern of dlarfg - ! maxrank: control max number of householder vectors per communication - ! eps: error threshold (integer) - ! update*: control update pattern in pdgeqr2_1dcomm ('incremental','full','merge') - ! merging = full update with tmatrix merging - ! tmerge*: 0: do not merge, 1: incremental merge, >1: recursive merge - ! 
only matters if update* == full - subroutine qr_pqrparam_init(pqrparam,size2d,update2d,tmerge2d,size1d,update1d,tmerge1d,maxrank,update,eps,hgmode) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input - CHARACTER :: update2d,update1d,update,hgmode - INTEGER(kind=ik) :: size2d,size1d,maxrank,eps,tmerge2d,tmerge1d - - ! output -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - INTEGER(kind=ik) :: PQRPARAM(*) -#else - INTEGER(kind=ik) :: PQRPARAM(1:11) -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pqrparam_init") -#endif - - PQRPARAM(1) = size2d - PQRPARAM(2) = ichar(update2d) - PQRPARAM(3) = tmerge2d - ! TODO: broadcast T yes/no - - PQRPARAM(4) = size1d - PQRPARAM(5) = ichar(update1d) - PQRPARAM(6) = tmerge1d - - PQRPARAM(7) = maxrank - PQRPARAM(8) = ichar(update) - PQRPARAM(9) = eps - PQRPARAM(10) = ichar(hgmode) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pqrparam_init") -#endif - - end subroutine qr_pqrparam_init - - subroutine qr_pdlarfg_copy_1dcomm(x,incx,v,incv,n,baseidx,idx,nb,rev,mpicomm) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables (local) - integer(kind=ik) :: incx,incv - real(kind=rk) :: x(*), v(*) - - ! input variables (global) - integer(kind=ik) :: baseidx,idx,rev,nb,n - integer(kind=ik) :: mpicomm - - ! output variables (global) - - ! local scalars - integer(kind=ik) :: mpierr,mpiprocs - integer(kind=ik) :: mpirank,mpirank_top - integer(kind=ik) :: irow,x_offset - integer(kind=ik) :: v_offset,local_size - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfg_copy_1dcomm") -#endif - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - - call local_size_offset_1d(n,nb,baseidx,idx,rev,mpirank,mpiprocs, & - local_size,v_offset,x_offset) - v_offset = v_offset * incv - - !print *,'copy:',mpirank,baseidx,v_offset,x_offset,local_size - - ! 
copy elements - do irow=1,local_size - v((irow-1)*incv+v_offset) = x((irow-1)*incx+x_offset) - end do - - ! replace top element to build an unitary vector - mpirank_top = MOD((idx-1)/nb,mpiprocs) - if (mpirank .eq. mpirank_top) then - v(local_size*incv) = 1.0d0 - end if -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg_copy_1dcomm") -#endif - - end subroutine qr_pdlarfg_copy_1dcomm - -end module elpa_pdgeqrf diff --git a/src/elpa_qr/elpa_pdlarfb.f90 b/src/elpa_qr/elpa_pdlarfb.f90 deleted file mode 100644 index 640c01333..000000000 --- a/src/elpa_qr/elpa_pdlarfb.f90 +++ /dev/null @@ -1,639 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! 
ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -module elpa_pdlarfb - - use elpa1_compute - use qr_utils_mod - - implicit none - - PRIVATE - - public :: qr_pdlarfb_1dcomm - public :: qr_pdlarft_pdlarfb_1dcomm - public :: qr_pdlarft_set_merge_1dcomm - public :: qr_pdlarft_tree_merge_1dcomm - public :: qr_pdlarfl_1dcomm - public :: qr_pdlarfl2_tmatrix_1dcomm - public :: qr_tmerge_pdlarfb_1dcomm - - include 'mpif.h' - -contains - -subroutine qr_pdlarfb_1dcomm(m,mb,n,k,a,lda,v,ldv,tau,t,ldt,baseidx,idx,rev,mpicomm,work,lwork) - use precision - use qr_utils_mod - - implicit none - - ! input variables (local) - integer(kind=ik) :: lda,ldv,ldt,lwork - real(kind=rk) :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(k,*) - - ! input variables (global) - integer(kind=ik) :: m,mb,n,k,baseidx,idx,rev,mpicomm - - ! output variables (global) - - ! derived input variables from QR_PQRPARAM - - ! local scalars - integer(kind=ik) :: localsize,offset,baseoffset - integer(kind=ik) :: mpirank,mpiprocs,mpierr - - if (idx .le. 1) return - - if (n .le. 0) return ! nothing to do - - if (k .eq. 1) then - call qr_pdlarfl_1dcomm(v,1,baseidx,a,lda,tau(1), & - work,lwork,m,n,idx,mb,rev,mpicomm) - return - else if (k .eq. 2) then - call qr_pdlarfl2_tmatrix_1dcomm(v,ldv,baseidx,a,lda,t,ldt, & - work,lwork,m,n,idx,mb,rev,mpicomm) - return - end if - - if (lwork .eq. -1) then - work(1,1) = DBLE(2*k*n) - return - end if - - !print *,'updating trailing matrix with k=',k - - call MPI_Comm_rank(mpicomm,mpirank,mpierr) - call MPI_Comm_size(mpicomm,mpiprocs,mpierr) - - ! use baseidx as idx here, otherwise the upper triangle part will be lost - ! 
during the calculation, especially in the reversed case - call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, & - localsize,baseoffset,offset) - - ! Z' = Y' * A - if (localsize .gt. 0) then - call dgemm("Trans","Notrans",k,n,localsize,1.0d0,v(baseoffset,1),ldv,a(offset,1),lda,0.0d0,work(1,1),k) - else - work(1:k,1:n) = 0.0d0 - end if - - ! data exchange - call mpi_allreduce(work(1,1),work(1,n+1),k*n,mpi_real8,mpi_sum,mpicomm,mpierr) - - call qr_pdlarfb_kernel_local(localsize,n,k,a(offset,1),lda,v(baseoffset,1),ldv,t,ldt,work(1,n+1),k) -end subroutine qr_pdlarfb_1dcomm - -! generalized pdlarfl2 version -! TODO: include T merge here (seperate by "old" and "new" index) -subroutine qr_pdlarft_pdlarfb_1dcomm(m,mb,n,oldk,k,v,ldv,tau,t,ldt,a,lda,baseidx,rev,mpicomm,work,lwork) - use precision - use qr_utils_mod - - implicit none - - ! input variables (local) - integer(kind=ik) :: ldv,ldt,lda,lwork - real(kind=rk) :: v(ldv,*),tau(*),t(ldt,*),work(k,*),a(lda,*) - - ! input variables (global) - integer(kind=ik) :: m,mb,n,k,oldk,baseidx,rev,mpicomm - - ! output variables (global) - - ! derived input variables from QR_PQRPARAM - - ! local scalars - integer(kind=ik) :: localsize,offset,baseoffset - integer(kind=ik) :: mpirank,mpiprocs,mpierr - integer(kind=ik) :: icol - - integer(kind=ik) :: sendoffset,recvoffset,sendsize - - sendoffset = 1 - sendsize = k*(k+n+oldk) - recvoffset = sendoffset+(k+n+oldk) - - if (lwork .eq. -1) then - work(1,1) = DBLE(2*(k*k+k*n+oldk)) - return - end if - - call MPI_Comm_rank(mpicomm,mpirank,mpierr) - call MPI_Comm_size(mpicomm,mpiprocs,mpierr) - - call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, & - localsize,baseoffset,offset) - - if (localsize .gt. 0) then - ! calculate inner product of householdervectors - call dsyrk("Upper","Trans",k,localsize,1.0d0,v(baseoffset,1),ldv,0.0d0,work(1,1),k) - - ! calculate matrix matrix product of householder vectors and target matrix - ! 
Z' = Y' * A - call dgemm("Trans","Notrans",k,n,localsize,1.0d0,v(baseoffset,1),ldv,a(offset,1),lda,0.0d0,work(1,k+1),k) - - ! TODO: reserved for T merge parts - work(1:k,n+k+1:n+k+oldk) = 0.0d0 - else - work(1:k,1:(n+k+oldk)) = 0.0d0 - end if - - ! exchange data - call mpi_allreduce(work(1,sendoffset),work(1,recvoffset),sendsize,mpi_real8,mpi_sum,mpicomm,mpierr) - - ! generate T matrix (pdlarft) - t(1:k,1:k) = 0.0d0 ! DEBUG: clear buffer first - - ! T1 = tau1 - ! | tauk Tk-1' * (-tauk * Y(:,1,k+1:n) * Y(:,k))' | - ! | 0 Tk-1 | - t(k,k) = tau(k) - do icol=k-1,1,-1 - t(icol,icol+1:k) = -tau(icol)*work(icol,recvoffset+icol:recvoffset+k-1) - call dtrmv("Upper","Trans","Nonunit",k-icol,t(icol+1,icol+1),ldt,t(icol,icol+1),ldt) - t(icol,icol) = tau(icol) - end do - - ! TODO: elmroth and gustavson - - ! update matrix (pdlarfb) - ! Z' = T * Z' - call dtrmm("Left","Upper","Notrans","Nonunit",k,n,1.0d0,t,ldt,work(1,recvoffset+k),k) - - ! A = A - Y * V' - call dgemm("Notrans","Notrans",localsize,n,k,-1.0d0,v(baseoffset,1),ldv,work(1,recvoffset+k),k,1.0d0,a(offset,1),lda) - -end subroutine qr_pdlarft_pdlarfb_1dcomm - -subroutine qr_pdlarft_set_merge_1dcomm(m,mb,n,blocksize,v,ldv,t,ldt,baseidx,rev,mpicomm,work,lwork) - use precision - use qr_utils_mod - - implicit none - - ! input variables (local) - integer(kind=ik) :: ldv,ldt,lwork - real(kind=rk) :: v(ldv,*),t(ldt,*),work(n,*) - - ! input variables (global) - integer(kind=ik) :: m,mb,n,blocksize,baseidx,rev,mpicomm - - ! output variables (global) - - ! derived input variables from QR_PQRPARAM - - ! local scalars - integer(kind=ik) :: localsize,offset,baseoffset - integer(kind=ik) :: mpirank,mpiprocs,mpierr - - if (lwork .eq. -1) then - work(1,1) = DBLE(2*n*n) - return - end if - - call MPI_Comm_rank(mpicomm,mpirank,mpierr) - call MPI_Comm_size(mpicomm,mpiprocs,mpierr) - - call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, & - localsize,baseoffset,offset) - - if (localsize .gt. 
0) then - call dsyrk("Upper","Trans",n,localsize,1.0d0,v(baseoffset,1),ldv,0.0d0,work(1,1),n) - else - work(1:n,1:n) = 0.0d0 - end if - - call mpi_allreduce(work(1,1),work(1,n+1),n*n,mpi_real8,mpi_sum,mpicomm,mpierr) - - ! skip Y4'*Y4 part - offset = mod(n,blocksize) - if (offset .eq. 0) offset=blocksize - call qr_tmerge_set_kernel(n,blocksize,t,ldt,work(1,n+1+offset),n) - -end subroutine qr_pdlarft_set_merge_1dcomm - -subroutine qr_pdlarft_tree_merge_1dcomm(m,mb,n,blocksize,treeorder,v,ldv,t,ldt,baseidx,rev,mpicomm,work,lwork) - use precision - use qr_utils_mod - - implicit none - - ! input variables (local) - integer(kind=ik) :: ldv,ldt,lwork - real(kind=rk) :: v(ldv,*),t(ldt,*),work(n,*) - - ! input variables (global) - integer(kind=ik) :: m,mb,n,blocksize,treeorder,baseidx,rev,mpicomm - - ! output variables (global) - - ! derived input variables from QR_PQRPARAM - - ! local scalars - integer(kind=ik) :: localsize,offset,baseoffset - integer(kind=ik) :: mpirank,mpiprocs,mpierr - - if (lwork .eq. -1) then - work(1,1) = DBLE(2*n*n) - return - end if - - if (n .le. blocksize) return ! nothing to do - - call MPI_Comm_rank(mpicomm,mpirank,mpierr) - call MPI_Comm_size(mpicomm,mpiprocs,mpierr) - - call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, & - localsize,baseoffset,offset) - - if (localsize .gt. 0) then - call dsyrk("Upper","Trans",n,localsize,1.0d0,v(baseoffset,1),ldv,0.0d0,work(1,1),n) - else - work(1:n,1:n) = 0.0d0 - end if - - call mpi_allreduce(work(1,1),work(1,n+1),n*n,mpi_real8,mpi_sum,mpicomm,mpierr) - - ! skip Y4'*Y4 part - offset = mod(n,blocksize) - if (offset .eq. 0) offset=blocksize - call qr_tmerge_tree_kernel(n,blocksize,treeorder,t,ldt,work(1,n+1+offset),n) - -end subroutine qr_pdlarft_tree_merge_1dcomm - -! apply householder vector to the left -! - assume unitary matrix -! 
- assume right positions for v -subroutine qr_pdlarfl_1dcomm(v,incv,baseidx,a,lda,tau,work,lwork,m,n,idx,mb,rev,mpicomm) - use precision - use ELPA1 - use qr_utils_mod - - implicit none - - ! input variables (local) - integer(kind=ik) :: incv,lda,lwork,baseidx - real(kind=rk) :: v(*),a(lda,*),work(*) - - ! input variables (global) - integer(kind=ik) :: m,n,mb,rev,idx,mpicomm - real(kind=rk) :: tau - - ! output variables (global) - - ! local scalars - integer(kind=ik) :: mpierr,mpirank,mpiprocs - integer(kind=ik) :: sendsize,recvsize,icol - integer(kind=ik) :: local_size,local_offset - integer(kind=ik) :: v_local_offset - - ! external functions - real(kind=rk), external :: ddot - - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - - sendsize = n - recvsize = sendsize - - if (lwork .eq. -1) then - work(1) = DBLE(sendsize + recvsize) - return - end if - - if (n .le. 0) return - - if (idx .le. 1) return - - call local_size_offset_1d(m,mb,baseidx,idx,rev,mpirank,mpiprocs, & - local_size,v_local_offset,local_offset) - - !print *,'hl ref',local_size,n - - v_local_offset = v_local_offset * incv - - if (local_size > 0) then - - do icol=1,n - work(icol) = dot_product(v(v_local_offset:v_local_offset+local_size-1),a(local_offset:local_offset+local_size-1,icol)) - - end do - else - work(1:n) = 0.0d0 - end if - - call mpi_allreduce(work, work(sendsize+1), sendsize, mpi_real8, mpi_sum, mpicomm, mpierr) - - if (local_size > 0) then - - do icol=1,n - a(local_offset:local_offset+local_size-1,icol) = a(local_offset:local_offset+local_size-1,icol) & - - tau*work(sendsize+icol)*v(v_local_offset:v_local_offset+ & - local_size-1) - enddo - end if - -end subroutine qr_pdlarfl_1dcomm - -subroutine qr_pdlarfl2_tmatrix_1dcomm(v,ldv,baseidx,a,lda,t,ldt,work,lwork,m,n,idx,mb,rev,mpicomm) - use precision - use ELPA1 - use qr_utils_mod - - implicit none - - ! 
input variables (local) - integer(kind=ik) :: ldv,lda,lwork,baseidx,ldt - real(kind=rk) :: v(ldv,*),a(lda,*),work(*),t(ldt,*) - - ! input variables (global) - integer(kind=ik) :: m,n,mb,rev,idx,mpicomm - - ! output variables (global) - - ! local scalars - integer(kind=ik) :: mpierr,mpirank,mpiprocs,mpirank_top1,mpirank_top2 - integer(kind=ik) :: dgemv1_offset,dgemv2_offset - integer(kind=ik) :: sendsize, recvsize - integer(kind=ik) :: local_size1,local_offset1 - integer(kind=ik) :: local_size2,local_offset2 - integer(kind=ik) :: local_size_dger,local_offset_dger - integer(kind=ik) :: v1_local_offset,v2_local_offset - integer(kind=ik) :: v_local_offset_dger - real(kind=rk) :: hvdot - integer(kind=ik) :: irow,icol,v1col,v2col - - ! external functions - real(kind=rk), external :: ddot - - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - - sendsize = 2*n - recvsize = sendsize - - if (lwork .eq. -1) then - work(1) = sendsize + recvsize - return - end if - - dgemv1_offset = 1 - dgemv2_offset = dgemv1_offset + n - - ! in 2x2 matrix case only one householder vector was generated - if (idx .le. 2) then - call qr_pdlarfl_1dcomm(v(1,2),1,baseidx,a,lda,t(2,2), & - work,lwork,m,n,idx,mb,rev,mpicomm) - return - end if - - call local_size_offset_1d(m,mb,baseidx,idx,rev,mpirank,mpiprocs, & - local_size1,v1_local_offset,local_offset1) - call local_size_offset_1d(m,mb,baseidx,idx-1,rev,mpirank,mpiprocs, & - local_size2,v2_local_offset,local_offset2) - - v1_local_offset = v1_local_offset * 1 - v2_local_offset = v2_local_offset * 1 - - v1col = 2 - v2col = 1 - - ! 
keep buffers clean in case that local_size1/local_size2 are zero - work(1:sendsize) = 0.0d0 - - call dgemv("Trans",local_size1,n,1.0d0,a(local_offset1,1),lda,v(v1_local_offset,v1col),1,0.0d0,work(dgemv1_offset),1) - call dgemv("Trans",local_size2,n,t(v2col,v2col),a(local_offset2,1),lda,v(v2_local_offset,v2col),1,0.0d0, & - work(dgemv2_offset),1) - - call mpi_allreduce(work, work(sendsize+1), sendsize, mpi_real8, mpi_sum, mpicomm, mpierr) - - ! update second vector - call daxpy(n,t(1,2),work(sendsize+dgemv1_offset),1,work(sendsize+dgemv2_offset),1) - - call local_size_offset_1d(m,mb,baseidx,idx-2,rev,mpirank,mpiprocs, & - local_size_dger,v_local_offset_dger,local_offset_dger) - - ! get ranks of processes with topelements - mpirank_top1 = MOD((idx-1)/mb,mpiprocs) - mpirank_top2 = MOD((idx-2)/mb,mpiprocs) - - if (mpirank_top1 .eq. mpirank) local_offset1 = local_size1 - if (mpirank_top2 .eq. mpirank) then - local_offset2 = local_size2 - v2_local_offset = local_size2 - end if - - ! use hvdot as temporary variable - hvdot = t(v1col,v1col) - do icol=1,n - ! make use of "1" entries in householder vectors - if (mpirank_top1 .eq. mpirank) then - a(local_offset1,icol) = a(local_offset1,icol) & - - work(sendsize+dgemv1_offset+icol-1)*hvdot - end if - - if (mpirank_top2 .eq. mpirank) then - a(local_offset2,icol) = a(local_offset2,icol) & - - v(v2_local_offset,v1col)*work(sendsize+dgemv1_offset+icol-1)*hvdot & - - work(sendsize+dgemv2_offset+icol-1) - end if - - do irow=1,local_size_dger - a(local_offset_dger+irow-1,icol) = a(local_offset_dger+irow-1,icol) & - - work(sendsize+dgemv1_offset+icol-1)*v(v_local_offset_dger+irow-1,v1col)*hvdot & - - work(sendsize+dgemv2_offset+icol-1)*v(v_local_offset_dger+irow-1,v2col) - end do - end do - -end subroutine qr_pdlarfl2_tmatrix_1dcomm - -! generalized pdlarfl2 version -! 
TODO: include T merge here (seperate by "old" and "new" index) -subroutine qr_tmerge_pdlarfb_1dcomm(m,mb,n,oldk,k,v,ldv,t,ldt,a,lda,baseidx,rev,updatemode,mpicomm,work,lwork) - use precision - use qr_utils_mod - - implicit none - - ! input variables (local) - integer(kind=ik) :: ldv,ldt,lda,lwork - real(kind=rk) :: v(ldv,*),t(ldt,*),work(*),a(lda,*) - - ! input variables (global) - integer(kind=ik) :: m,mb,n,k,oldk,baseidx,rev,updatemode,mpicomm - - ! output variables (global) - - ! derived input variables from QR_PQRPARAM - - ! local scalars - integer(kind=ik) :: localsize,offset,baseoffset - integer(kind=ik) :: mpirank,mpiprocs,mpierr - - integer(kind=ik) :: sendoffset,recvoffset,sendsize - integer(kind=ik) :: updateoffset,updatelda,updatesize - integer(kind=ik) :: mergeoffset,mergelda,mergesize - integer(kind=ik) :: tgenoffset,tgenlda,tgensize - - if (updatemode .eq. ichar('I')) then - updatelda = oldk+k - else - updatelda = k - end if - - updatesize = updatelda*n - - mergelda = k - mergesize = mergelda*oldk - - tgenlda = 0 - tgensize = 0 - - sendsize = updatesize + mergesize + tgensize - - if (lwork .eq. -1) then - work(1) = DBLE(2*sendsize) - return - end if - - call MPI_Comm_rank(mpicomm,mpirank,mpierr) - call MPI_Comm_size(mpicomm,mpiprocs,mpierr) - - ! use baseidx as idx here, otherwise the upper triangle part will be lost - ! during the calculation, especially in the reversed case - call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, & - localsize,baseoffset,offset) - - sendoffset = 1 - - if (oldk .gt. 0) then - updateoffset = 0 - mergeoffset = updateoffset + updatesize - tgenoffset = mergeoffset + mergesize - - sendsize = updatesize + mergesize + tgensize - - !print *,'sendsize',sendsize,updatesize,mergesize,tgensize - !print *,'merging nr of rotations', oldk+k - - if (localsize .gt. 0) then - ! calculate matrix matrix product of householder vectors and target matrix - - if (updatemode .eq. ichar('I')) then - ! 
Z' = (Y1,Y2)' * A - call dgemm("Trans","Notrans",k+oldk,n,localsize,1.0d0,v(baseoffset,1),ldv,a(offset,1),lda,0.0d0, & - work(sendoffset+updateoffset),updatelda) - else - ! Z' = Y1' * A - call dgemm("Trans","Notrans",k,n,localsize,1.0d0,v(baseoffset,1),ldv,a(offset,1),lda,0.0d0, & - work(sendoffset+updateoffset),updatelda) - end if - - ! calculate parts needed for T merge - call dgemm("Trans","Notrans",k,oldk,localsize,1.0d0,v(baseoffset,1),ldv,v(baseoffset,k+1),ldv,0.0d0, & - work(sendoffset+mergeoffset),mergelda) - - else - ! cleanup buffer - work(sendoffset:sendoffset+sendsize-1) = 0.0d0 - end if - else - ! do not calculate parts for T merge as there is nothing to merge - - updateoffset = 0 - - tgenoffset = updateoffset + updatesize - - sendsize = updatesize + tgensize - - if (localsize .gt. 0) then - ! calculate matrix matrix product of householder vectors and target matrix - ! Z' = (Y1)' * A - call dgemm("Trans","Notrans",k,n,localsize,1.0d0,v(baseoffset,1),ldv,a(offset,1),lda,0.0d0, & - work(sendoffset+updateoffset),updatelda) - - else - ! cleanup buffer - work(sendoffset:sendoffset+sendsize-1) = 0.0d0 - end if - - end if - - recvoffset = sendoffset + sendsize - - if (sendsize .le. 0) return ! nothing to do - - ! exchange data - call mpi_allreduce(work(sendoffset),work(recvoffset),sendsize,mpi_real8,mpi_sum,mpicomm,mpierr) - - updateoffset = recvoffset+updateoffset - mergeoffset = recvoffset+mergeoffset - tgenoffset = recvoffset+tgenoffset - - if (oldk .gt. 0) then - call qr_pdlarft_merge_kernel_local(oldk,k,t,ldt,work(mergeoffset),mergelda) - - if (localsize .gt. 0) then - if (updatemode .eq. ichar('I')) then - - ! update matrix (pdlarfb) with complete T - call qr_pdlarfb_kernel_local(localsize,n,k+oldk,a(offset,1),lda,v(baseoffset,1),ldv,t(1,1),ldt, & - work(updateoffset),updatelda) - else - ! 
update matrix (pdlarfb) with small T (same as update with no old T TODO) - call qr_pdlarfb_kernel_local(localsize,n,k,a(offset,1),lda,v(baseoffset,1),ldv,t(1,1),ldt, & - work(updateoffset),updatelda) - end if - end if - else - if (localsize .gt. 0) then - ! update matrix (pdlarfb) with small T - call qr_pdlarfb_kernel_local(localsize,n,k,a(offset,1),lda,v(baseoffset,1),ldv,t(1,1),ldt, & - work(updateoffset),updatelda) - end if - end if - -end subroutine qr_tmerge_pdlarfb_1dcomm - -end module elpa_pdlarfb diff --git a/src/elpa_qr/elpa_qrkernels.f90 b/src/elpa_qr/elpa_qrkernels.f90 deleted file mode 100644 index 1ffd3721a..000000000 --- a/src/elpa_qr/elpa_qrkernels.f90 +++ /dev/null @@ -1,783 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! 
You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! calculates A = A - Y*T'*Z (rev=0) -! calculates A = A - Y*T*Z (rev=1) -! T upper triangle matrix -! assuming zero entries in matrix in upper kxk block -subroutine qr_pdlarfb_kernel_local(m,n,k,a,lda,v,ldv,t,ldt,z,ldz) - use precision - implicit none - - ! input variables (local) - integer(kind=ik) :: lda,ldv,ldt,ldz - real(kind=rk) :: a(lda,*),v(ldv,*),t(ldt,*),z(ldz,*) - - ! input variables (global) - integer(kind=ik) :: m,n,k - - ! local variables - real(kind=rk) :: t11 - real(kind=rk) :: t12,t22,sum1,sum2 - real(kind=rk) :: t13,t23,t33,sum3 - real(kind=rk) :: sum4,t44 - real(kind=rk) :: y1,y2,y3,y4 - real(kind=rk) :: a1 - integer(kind=ik) :: icol,irow,v1col,v2col,v3col - - ! reference implementation - if (k .eq. 1) then - t11 = t(1,1) - do icol=1,n - sum1 = z(1,icol) - a(1:m,icol) = a(1:m,icol) - t11*sum1*v(1:m,1) - enddo - return - else if (k .eq. 2) then - v1col = 2 - v2col = 1 - t22 = t(1,1) - t12 = t(1,2) - t11 = t(2,2) - - do icol=1,n - sum1 = t11 * z(v1col,icol) - sum2 = t12 * z(v1col,icol) + t22 * z(v2col,icol) - - do irow=1,m - a(irow,icol) = a(irow,icol) - v(irow,v1col) * sum1 - v(irow,v2col) * sum2 - end do - end do - else if (k .eq. 3) then - v1col = 3 - v2col = 2 - v3col = 1 - - t33 = t(1,1) - - t23 = t(1,2) - t22 = t(2,2) - - t13 = t(1,3) - t12 = t(2,3) - t11 = t(3,3) - - do icol=1,n - ! 
misusing variables for fetch of z parts - y1 = z(v1col,icol) - y2 = z(v2col,icol) - y3 = z(v3col,icol) - - sum1 = t11 * y1!+ 0 * y2!+ 0 * y3 - sum2 = t12 * y1 + t22 * y2!+ 0 * y3 - sum3 = t13 * y1 + t23 * y2 + t33 * y3 - - do irow=1,m - a(irow,icol) = a(irow,icol) - v(irow,v1col) * sum1 - v(irow,v2col) * sum2 - v(irow,v3col) * sum3 - end do - end do - else if (k .eq. 4) then - do icol=1,n - ! misusing variables for fetch of z parts - y1 = z(1,icol) - y2 = z(2,icol) - y3 = z(3,icol) - y4 = z(4,icol) - - ! dtrmv like - starting from main diagonal and working - ! upwards - t11 = t(1,1) - t22 = t(2,2) - t33 = t(3,3) - t44 = t(4,4) - - sum1 = t11 * y1 - sum2 = t22 * y2 - sum3 = t33 * y3 - sum4 = t44 * y4 - - t11 = t(1,2) - t22 = t(2,3) - t33 = t(3,4) - - sum1 = sum1 + t11 * y2 - sum2 = sum2 + t22 * y3 - sum3 = sum3 + t33 * y4 - - t11 = t(1,3) - t22 = t(2,4) - - sum1 = sum1 + t11 * y3 - sum2 = sum2 + t22 * y4 - - t11 = t(1,4) - sum1 = sum1 + t11 * y4 - - ! one column of V is calculated - ! time to calculate A - Y * V - do irow=1,m ! TODO: loop unrolling - y1 = v(irow,1) - y2 = v(irow,2) - y3 = v(irow,3) - y4 = v(irow,4) - - a1 = a(irow,icol) - - a1 = a1 - y1*sum1 - a1 = a1 - y2*sum2 - a1 = a1 - y3*sum3 - a1 = a1 - y4*sum4 - - a(irow,icol) = a1 - end do - end do - else - ! reference implementation - ! V' = T * Z' - call dtrmm("Left","Upper","Notrans","Nonunit",k,n,1.0d0,t,ldt,z,ldz) - ! A = A - Y * V' - call dgemm("Notrans","Notrans",m,n,k,-1.0d0,v,ldv,z,ldz,1.0d0,a,lda) - end if - -end subroutine -subroutine qr_pdlarft_merge_kernel_local(oldk,k,t,ldt,yty,ldy) - use precision - implicit none - - ! input variables (local) - integer(kind=ik) :: ldt,ldy - real(kind=rk) :: t(ldt,*),yty(ldy,*) - - ! input variables (global) - integer(kind=ik) :: k,oldk - - ! output variables (global) - - ! local scalars - integer(kind=ik) :: icol,leftk,rightk - - ! 
local scalars for optimized versions - integer(kind=ik) :: irow - real(kind=rk) :: t11 - real(kind=rk) :: yty1,yty2,yty3,yty4,yty5,yty6,yty7,yty8 - real(kind=rk) :: reg01,reg02,reg03,reg04,reg05,reg06,reg07,reg08 - real(kind=rk) :: final01,final02,final03,final04,final05,final06,final07,final08 - - if (oldk .eq. 0) return ! nothing to be done - - leftk = k - rightk = oldk - - ! optimized implementations: - if (leftk .eq. 1) then - do icol=1,rightk - ! multiply inner products with right t matrix - ! (dtrmv like) - yty1 = yty(1,1) - t11 = t(leftk+1,leftk+icol) - - reg01 = yty1 * t11 - - do irow=2,icol - yty1 = yty(1,irow) - t11 = t(leftk+irow,leftk+icol) - - reg01 = reg01 + yty1 * t11 - end do - - ! multiply intermediate results with left t matrix and store in final t - ! matrix - t11 = -t(1,1) - final01 = t11 * reg01 - t(1,leftk+icol) = final01 - end do - - !print *,'efficient tmerge - leftk=1' - else if (leftk .eq. 2) then - do icol=1,rightk - ! multiply inner products with right t matrix - ! (dtrmv like) - yty1 = yty(1,1) - yty2 = yty(2,1) - - t11 = t(leftk+1,leftk+icol) - - reg01 = yty1 * t11 - reg02 = yty2 * t11 - - do irow=2,icol - yty1 = yty(1,irow) - yty2 = yty(2,irow) - t11 = t(leftk+irow,leftk+icol) - - reg01 = reg01 + yty1 * t11 - reg02 = reg02 + yty2 * t11 - end do - - ! multiply intermediate results with left t matrix and store in final t - ! matrix - yty1 = -t(1,1) - yty2 = -t(1,2) - yty3 = -t(2,2) - - final01 = reg02 * yty2 - final02 = reg02 * yty3 - - final01 = final01 + reg01 * yty1 - - t(1,leftk+icol) = final01 - t(2,leftk+icol) = final02 - end do - - !print *,'efficient tmerge - leftk=2' - else if (leftk .eq. 4) then - do icol=1,rightk - ! multiply inner products with right t matrix - ! 
(dtrmv like) - yty1 = yty(1,1) - yty2 = yty(2,1) - yty3 = yty(3,1) - yty4 = yty(4,1) - - t11 = t(leftk+1,leftk+icol) - - reg01 = yty1 * t11 - reg02 = yty2 * t11 - reg03 = yty3 * t11 - reg04 = yty4 * t11 - - do irow=2,icol - yty1 = yty(1,irow) - yty2 = yty(2,irow) - yty3 = yty(3,irow) - yty4 = yty(4,irow) - - t11 = t(leftk+irow,leftk+icol) - - reg01 = reg01 + yty1 * t11 - reg02 = reg02 + yty2 * t11 - reg03 = reg03 + yty3 * t11 - reg04 = reg04 + yty4 * t11 - end do - - ! multiply intermediate results with left t matrix and store in final t - ! matrix (start from diagonal and move upwards) - yty1 = -t(1,1) - yty2 = -t(2,2) - yty3 = -t(3,3) - yty4 = -t(4,4) - - ! main diagonal - final01 = reg01 * yty1 - final02 = reg02 * yty2 - final03 = reg03 * yty3 - final04 = reg04 * yty4 - - ! above main diagonal - yty1 = -t(1,2) - yty2 = -t(2,3) - yty3 = -t(3,4) - - final01 = final01 + reg02 * yty1 - final02 = final02 + reg03 * yty2 - final03 = final03 + reg04 * yty3 - - ! above first side diagonal - yty1 = -t(1,3) - yty2 = -t(2,4) - - final01 = final01 + reg03 * yty1 - final02 = final02 + reg04 * yty2 - - ! above second side diagonal - yty1 = -t(1,4) - - final01 = final01 + reg04 * yty1 - - ! write back to final matrix - t(1,leftk+icol) = final01 - t(2,leftk+icol) = final02 - t(3,leftk+icol) = final03 - t(4,leftk+icol) = final04 - end do - - !print *,'efficient tmerge - leftk=4' - else if (leftk .eq. 8) then - do icol=1,rightk - ! multiply inner products with right t matrix - ! 
(dtrmv like) - yty1 = yty(1,1) - yty2 = yty(2,1) - yty3 = yty(3,1) - yty4 = yty(4,1) - yty5 = yty(5,1) - yty6 = yty(6,1) - yty7 = yty(7,1) - yty8 = yty(8,1) - - t11 = t(leftk+1,leftk+icol) - - reg01 = yty1 * t11 - reg02 = yty2 * t11 - reg03 = yty3 * t11 - reg04 = yty4 * t11 - reg05 = yty5 * t11 - reg06 = yty6 * t11 - reg07 = yty7 * t11 - reg08 = yty8 * t11 - - do irow=2,icol - yty1 = yty(1,irow) - yty2 = yty(2,irow) - yty3 = yty(3,irow) - yty4 = yty(4,irow) - yty5 = yty(5,irow) - yty6 = yty(6,irow) - yty7 = yty(7,irow) - yty8 = yty(8,irow) - - t11 = t(leftk+irow,leftk+icol) - - reg01 = reg01 + yty1 * t11 - reg02 = reg02 + yty2 * t11 - reg03 = reg03 + yty3 * t11 - reg04 = reg04 + yty4 * t11 - reg05 = reg05 + yty5 * t11 - reg06 = reg06 + yty6 * t11 - reg07 = reg07 + yty7 * t11 - reg08 = reg08 + yty8 * t11 - end do - - ! multiply intermediate results with left t matrix and store in final t - ! matrix (start from diagonal and move upwards) - yty1 = -t(1,1) - yty2 = -t(2,2) - yty3 = -t(3,3) - yty4 = -t(4,4) - yty5 = -t(5,5) - yty6 = -t(6,6) - yty7 = -t(7,7) - yty8 = -t(8,8) - - ! main diagonal - final01 = reg01 * yty1 - final02 = reg02 * yty2 - final03 = reg03 * yty3 - final04 = reg04 * yty4 - final05 = reg05 * yty5 - final06 = reg06 * yty6 - final07 = reg07 * yty7 - final08 = reg08 * yty8 - - ! above main diagonal - yty1 = -t(1,2) - yty2 = -t(2,3) - yty3 = -t(3,4) - yty4 = -t(4,5) - yty5 = -t(5,6) - yty6 = -t(6,7) - yty7 = -t(7,8) - - final01 = final01 + reg02 * yty1 - final02 = final02 + reg03 * yty2 - final03 = final03 + reg04 * yty3 - final04 = final04 + reg05 * yty4 - final05 = final05 + reg06 * yty5 - final06 = final06 + reg07 * yty6 - final07 = final07 + reg08 * yty7 - - ! 
above first side diagonal - yty1 = -t(1,3) - yty2 = -t(2,4) - yty3 = -t(3,5) - yty4 = -t(4,6) - yty5 = -t(5,7) - yty6 = -t(6,8) - - final01 = final01 + reg03 * yty1 - final02 = final02 + reg04 * yty2 - final03 = final03 + reg05 * yty3 - final04 = final04 + reg06 * yty4 - final05 = final05 + reg07 * yty5 - final06 = final06 + reg08 * yty6 - - !above second side diagonal - - yty1 = -t(1,4) - yty2 = -t(2,5) - yty3 = -t(3,6) - yty4 = -t(4,7) - yty5 = -t(5,8) - - final01 = final01 + reg04 * yty1 - final02 = final02 + reg05 * yty2 - final03 = final03 + reg06 * yty3 - final04 = final04 + reg07 * yty4 - final05 = final05 + reg08 * yty5 - - ! i think you got the idea by now - - yty1 = -t(1,5) - yty2 = -t(2,6) - yty3 = -t(3,7) - yty4 = -t(4,8) - - final01 = final01 + reg05 * yty1 - final02 = final02 + reg06 * yty2 - final03 = final03 + reg07 * yty3 - final04 = final04 + reg08 * yty4 - - ! ..... - - yty1 = -t(1,6) - yty2 = -t(2,7) - yty3 = -t(3,8) - - final01 = final01 + reg06 * yty1 - final02 = final02 + reg07 * yty2 - final03 = final03 + reg08 * yty3 - - ! ..... - - yty1 = -t(1,7) - yty2 = -t(2,8) - - final01 = final01 + reg07 * yty1 - final02 = final02 + reg08 * yty2 - - ! ..... - - yty1 = -t(1,8) - - final01 = final01 + reg08 * yty1 - - ! write back to final matrix - t(1,leftk+icol) = final01 - t(2,leftk+icol) = final02 - t(3,leftk+icol) = final03 - t(4,leftk+icol) = final04 - t(5,leftk+icol) = final05 - t(6,leftk+icol) = final06 - t(7,leftk+icol) = final07 - t(8,leftk+icol) = final08 - end do - - !print *,'efficient tmerge - leftk=8' - else - ! reference implementation - do icol=1,rightk - t(1:leftk,leftk+icol) = yty(1:leftk,icol) - end do - - ! -T1 * Y1'*Y2 - call dtrmm("Left","Upper","Notrans","Nonunit",leftk,rightk,-1.0d0,t(1,1),ldt,t(1,leftk+1),ldt) - ! (-T1 * Y1'*Y2) * T2 - call dtrmm("Right","Upper","Notrans","Nonunit",leftk,rightk,1.0d0,t(leftk+1,leftk+1),ldt,t(1,leftk+1),ldt) - end if - -end subroutine -! yty structure -! Y1'*Y2 Y1'*Y3 Y1'*Y4 ... -! 
0 Y2'*Y3 Y2'*Y4 ... -! 0 0 Y3'*Y4 ... -! 0 0 0 ... -subroutine qr_tmerge_set_kernel(k,blocksize,t,ldt,yty,ldy) - use precision - implicit none - - ! input variables (local) - integer(kind=ik) :: ldt,ldy - real(kind=rk) :: t(ldt,*),yty(ldy,*) - - ! input variables (global) - integer(kind=ik) :: k,blocksize - - ! output variables (global) - - ! local scalars - integer(kind=ik) :: nr_blocks,current_block - integer(kind=ik) :: remainder,oldk - integer(kind=ik) :: yty_column,toffset - - if (k .le. blocksize) return ! nothing to merge - - nr_blocks = k / blocksize - remainder = k - nr_blocks*blocksize - - ! work in "negative" direction: - ! start with latest T matrix part and add older ones - toffset = 1 - yty_column = 1 - - if (remainder .gt. 0) then - call qr_pdlarft_merge_kernel_local(blocksize,remainder,t(toffset,toffset),ldt,yty(1,yty_column),ldy) - current_block = 1 - oldk = remainder+blocksize - yty_column = yty_column + blocksize - else - call qr_pdlarft_merge_kernel_local(blocksize,blocksize,t(toffset,toffset),ldt,yty(1,yty_column),ldy) - current_block = 2 - oldk = 2*blocksize - yty_column = yty_column + blocksize - end if - - do while (current_block .lt. nr_blocks) - call qr_pdlarft_merge_kernel_local(blocksize,oldk,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy) - - current_block = current_block + 1 - oldk = oldk + blocksize - yty_column = yty_column + blocksize - end do - -end subroutine -! yty structure -! Y1'*Y2 Y1'*Y3 Y1'*Y4 ... -! 0 Y2'*Y3 Y2'*Y4 ... -! 0 0 Y3'*Y4 ... -! 0 0 0 ... - -subroutine qr_tmerge_tree_kernel(k,blocksize,treeorder,t,ldt,yty,ldy) - use precision - implicit none - - ! input variables (local) - integer(kind=ik) :: ldt,ldy - real(kind=rk) :: t(ldt,*),yty(ldy,*) - - ! input variables (global) - integer(kind=ik) :: k,blocksize,treeorder - - ! output variables (global) - - ! 
local scalars - integer temp_blocksize,nr_sets,current_set,setsize,nr_blocks - integer remainder,max_treeorder,remaining_size - integer toffset,yty_column - integer toffset_start,yty_column_start - integer yty_end,total_remainder,yty_remainder - - if (treeorder .eq. 0) return ! no merging - - if (treeorder .eq. 1) then - call qr_tmerge_set_kernel(k,blocksize,t,ldt,yty,ldy) - return - end if - - nr_blocks = k / blocksize - max_treeorder = min(nr_blocks,treeorder) - - if (max_treeorder .eq. 1) then - call qr_tmerge_set_kernel(k,blocksize,t,ldt,yty,ldy) - return - end if - - ! work in "negative" direction: from latest set to oldest set - ! implementation differs from rev=0 version due to issues with - ! calculating the remainder parts - ! compared to the rev=0 version we split remainder parts directly from - ! parts which can be easily merged in a recursive way - - yty_end = (k / blocksize) * blocksize - if (yty_end .eq. k) then - yty_end = yty_end - blocksize - end if - - !print *,'tree',yty_end,k,blocksize - - yty_column_start = 1 - toffset_start = 1 - - ! is there a remainder block? - nr_blocks = k / blocksize - remainder = k - nr_blocks * blocksize - if (remainder .eq. 0) then - !print *,'no initial remainder' - - ! set offsets to the very beginning as there is no remainder part - yty_column_start = 1 - toffset_start = 1 - total_remainder = 0 - remaining_size = k - yty_remainder = 0 - else - !print *,'starting with initial remainder' - ! select submatrix and make remainder block public - yty_column_start = 1 + blocksize - toffset_start = 1 + remainder - total_remainder = remainder - remaining_size = k - remainder - yty_remainder = 1 - end if - - ! from now on it is a clean set of blocks with sizes of multiple of - ! blocksize - - temp_blocksize = blocksize - - !------------------------------- - do while (remaining_size .gt. 0) - nr_blocks = remaining_size / temp_blocksize - max_treeorder = min(nr_blocks,treeorder) - - if (max_treeorder .eq. 
1) then - remainder = 0 - nr_sets = 0 - setsize = 0 - - if (yty_remainder .gt. 0) then - yty_column = yty_remainder - !print *,'final merging with remainder',temp_blocksize,k,remaining_size,yty_column - call qr_tmerge_set_kernel(k,temp_blocksize,t,ldt,yty(1,yty_column),ldy) - else - !print *,'no remainder - no merging needed',temp_blocksize,k,remaining_size - endif - - remaining_size = 0 - - return ! done - else - nr_sets = nr_blocks / max_treeorder - setsize = max_treeorder*temp_blocksize - remainder = remaining_size - nr_sets*setsize - end if - - if (remainder .gt. 0) then - if (remainder .gt. temp_blocksize) then - toffset = toffset_start - yty_column = yty_column_start - - !print *,'set merging', toffset, yty_column,remainder - call qr_tmerge_set_kernel(remainder,temp_blocksize,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy) - - if (total_remainder .gt. 0) then - ! merge with existing global remainder part - !print *,'single+set merging',yty_remainder,total_remainder,remainder - - call qr_pdlarft_merge_kernel_local(remainder,total_remainder,t(1,1),ldt,yty(1,yty_remainder),ldy) - - yty_remainder = yty_remainder + remainder - toffset_start = toffset_start + remainder - - !print *,'single+set merging (new offsets)',yty_remainder,yty_column_start,toffset_start - - yty_column_start = yty_column_start + remainder - else - ! create new remainder part - !print *,'new remainder+set',yty_remainder - yty_remainder = yty_column_start + remainder - temp_blocksize - yty_column_start = yty_column_start + remainder - toffset_start = toffset_start + remainder - !print *,'new remainder+set (new offsets)',yty_remainder,yty_column_start,toffset_start - end if - - else - if (total_remainder .gt. 0) then - ! 
merge with existing global remainder part - !print *,'single merging',yty_remainder,total_remainder,remainder - - call qr_pdlarft_merge_kernel_local(remainder,total_remainder,t(1,1),ldt,yty(1,yty_remainder),ldy) - - yty_remainder = yty_remainder + remainder - toffset_start = toffset_start + remainder - - !print *,'single merging (new offsets)',yty_remainder,yty_column_start,toffset_start - - yty_column_start = yty_column_start + remainder - else - ! create new remainder part - !print *,'new remainder',yty_remainder - yty_remainder = yty_column_start - yty_column_start = yty_column_start + temp_blocksize - toffset_start = toffset_start + remainder - !print *,'new remainder (new offsets)',yty_remainder,yty_column_start,toffset_start - end if - end if - - total_remainder = total_remainder + remainder - remaining_size = remaining_size - remainder - end if - - current_set = 0 - do while (current_set .lt. nr_sets) - toffset = toffset_start + current_set * setsize - yty_column = yty_column_start + current_set * setsize - - !print *,'recursive merging', toffset, yty_column,setsize - - call qr_tmerge_set_kernel(setsize,temp_blocksize,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy) - - current_set = current_set + 1 - end do - - !print *,'increasing blocksize', temp_blocksize, setsize - yty_column_start = yty_column_start + (setsize - temp_blocksize) - temp_blocksize = setsize - end do -end subroutine -! yty should not contain the inner products vi'*vi -subroutine qr_dlarft_kernel(n,tau,yty,ldy,t,ldt) - use precision - implicit none - - ! input variables - integer(kind=ik) :: n,ldy,ldt - real(kind=rk) :: tau(*),yty(ldy,*) - - ! output variables - real(kind=rk) :: t(ldt,*) - - ! local variables - integer(kind=ik) :: icol - - ! DEBUG: clear buffer first - !t(1:n,1:n) = 0.0d0 - - ! T1 = tau1 - ! | tauk Tk-1' * (-tauk * Y(:,1,k+1:n) * Y(:,k))' | - ! 
| 0 Tk-1 | - t(n,n) = tau(n) - do icol=n-1,1,-1 - t(icol,icol+1:n) = -tau(icol)*yty(icol,icol:n-1) - call dtrmv("Upper","Trans","Nonunit",n-icol,t(icol+1,icol+1),ldt,t(icol,icol+1),ldt) - t(icol,icol) = tau(icol) - end do -end subroutine diff --git a/src/elpa_qr/qr_utils.f90 b/src/elpa_qr/qr_utils.f90 deleted file mode 100644 index 5a8eb15dc..000000000 --- a/src/elpa_qr/qr_utils.f90 +++ /dev/null @@ -1,396 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! 
may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -module qr_utils_mod - - implicit none - - PRIVATE - - public :: local_size_offset_1d - public :: reverse_vector_local - public :: reverse_matrix_local - public :: reverse_matrix_1dcomm - public :: reverse_matrix_2dcomm_ref - -contains - -! rev parameter is critical, even in rev only mode! -! pdgeqrf_2dcomm uses rev=0 version to determine the process columns -! involved in the qr decomposition -subroutine local_size_offset_1d(n,nb,baseidx,idx,rev,rank,nprocs, & - lsize,baseoffset,offset) - - use precision - use ELPA1_compute - - implicit none - - ! input - integer(kind=ik) :: n,nb,baseidx,idx,rev,rank,nprocs - - ! output - integer(kind=ik) :: lsize,baseoffset,offset - - ! local scalars - integer(kind=ik) :: rank_idx - - rank_idx = MOD((idx-1)/nb,nprocs) - - ! calculate local size and offsets - if (rev .eq. 1) then - if (idx > 0) then - lsize = local_index(idx,rank,nprocs,nb,-1) - else - lsize = 0 - end if - - baseoffset = 1 - offset = 1 - else - offset = local_index(idx,rank,nprocs,nb,1) - baseoffset = local_index(baseidx,rank,nprocs,nb,1) - - lsize = local_index(n,rank,nprocs,nb,-1) - !print *,'baseidx,idx',baseidx,idx,lsize,n - - lsize = lsize - offset + 1 - - baseoffset = offset - baseoffset + 1 - end if - -end subroutine local_size_offset_1d - - -subroutine reverse_vector_local(n,x,incx,work,lwork) - use precision - implicit none - - ! input - integer(kind=ik) :: incx,n,lwork - real(kind=rk) :: x(*),work(*) - - ! local scalars - real(kind=rk) :: temp - integer(kind=ik) :: srcoffset,destoffset,ientry - - if (lwork .eq. 
-1) then - work(1) = 0.0d0 - return - end if - - do ientry=1,n/2 - srcoffset=1+(ientry-1)*incx - destoffset=1+(n-ientry)*incx - - temp = x(srcoffset) - x(srcoffset) = x(destoffset) - x(destoffset) = temp - end do - -end subroutine reverse_vector_local - -subroutine reverse_matrix_local(trans,m,n,a,lda,work,lwork) - use precision - implicit none - - ! input - integer(kind=ik) :: lda,m,n,lwork,trans - real(kind=rk) :: a(lda,*),work(*) - - ! local scalars - real(kind=rk) :: temp, dworksize(1) - integer(kind=ik) :: incx - integer(kind=ik) :: dimsize - integer(kind=ik) :: i - - if (trans .eq. 1) then - incx = lda - dimsize = n - else - incx = 1 - dimsize = m - end if - - if (lwork .eq. -1) then - call reverse_vector_local(dimsize,a,incx,dworksize,-1) - work(1) = dworksize(1) - return - end if - - if (trans .eq. 1) then - do i=1,m - call reverse_vector_local(dimsize,a(i,1),incx,work,lwork) - end do - else - do i=1,n - call reverse_vector_local(dimsize,a(1,i),incx,work,lwork) - end do - end if - -end subroutine reverse_matrix_local - -subroutine reverse_matrix_2dcomm_ref(m,n,mb,nb,a,lda,work,lwork,mpicomm_cols,mpicomm_rows) - use precision - implicit none - - ! input - integer(kind=ik) :: m,n,lda,lwork,mpicomm_cols,mpicomm_rows,mb,nb - real(kind=rk) :: a(lda,*),work(*) - - ! 
local scalars - real(kind=rk) :: reverse_column_size(1) - real(kind=rk) :: reverse_row_size(1) - - integer(kind=ik) :: mpirank_cols,mpirank_rows - integer(kind=ik) :: mpiprocs_cols,mpiprocs_rows - integer(kind=ik) :: mpierr - integer(kind=ik) :: lrows,lcols,offset,baseoffset - - call MPI_Comm_rank(mpicomm_cols,mpirank_cols,mpierr) - call MPI_Comm_rank(mpicomm_rows,mpirank_rows,mpierr) - call MPI_Comm_size(mpicomm_cols,mpiprocs_cols,mpierr) - call MPI_Comm_size(mpicomm_rows,mpiprocs_rows,mpierr) - - call local_size_offset_1d(m,mb,1,1,0,mpirank_cols,mpiprocs_cols, & - lrows,baseoffset,offset) - - call local_size_offset_1d(n,nb,1,1,0,mpirank_rows,mpiprocs_rows, & - lcols,baseoffset,offset) - - if (lwork .eq. -1) then - call reverse_matrix_1dcomm(0,m,lcols,mb,a,lda,reverse_column_size,-1,mpicomm_cols) - call reverse_matrix_1dcomm(1,lrows,n,nb,a,lda,reverse_row_size,-1,mpicomm_rows) - work(1) = max(reverse_column_size(1),reverse_row_size(1)) - return - end if - - call reverse_matrix_1dcomm(0,m,lcols,mb,a,lda,work,lwork,mpicomm_cols) - call reverse_matrix_1dcomm(1,lrows,n,nb,a,lda,work,lwork,mpicomm_rows) -end subroutine reverse_matrix_2dcomm_ref - -! b: if trans = 'N': b is size of block distribution between rows -! b: if trans = 'T': b is size of block distribution between columns -subroutine reverse_matrix_1dcomm(trans,m,n,b,a,lda,work,lwork,mpicomm) - use precision - use mpi - - implicit none - - ! input - integer(kind=ik) :: trans - integer(kind=ik) :: m,n,b,lda,lwork,mpicomm - real(kind=rk) :: a(lda,*),work(*) - - ! 
local scalars - integer(kind=ik) :: mpirank,mpiprocs,mpierr,mpistatus(MPI_STATUS_SIZE) - integer(kind=ik) :: nr_blocks,dest_process,src_process,step - integer(kind=ik) :: lsize,baseoffset,offset - integer(kind=ik) :: current_index,destblk,srcblk,icol,next_index - integer(kind=ik) :: sendcount,recvcount - integer(kind=ik) :: sendoffset,recvoffset - integer(kind=ik) :: newmatrix_offset,work_offset - integer(kind=ik) :: lcols,lrows,lroffset,lcoffset,dimsize,fixedsize - real(kind=rk) :: dworksize(1) - - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - - if (trans .eq. 1) then - call local_size_offset_1d(n,b,1,1,0,mpirank,mpiprocs, & - lcols,baseoffset,lcoffset) - lrows = m - else - call local_size_offset_1d(m,b,1,1,0,mpirank,mpiprocs, & - lrows,baseoffset,lroffset) - lcols = n - end if - - if (lwork .eq. -1) then - call reverse_matrix_local(trans,lrows,lcols,a,max(lrows,lcols),dworksize,-1) - work(1) = DBLE(3*lrows*lcols) + dworksize(1) - return - end if - - sendoffset = 1 - recvoffset = sendoffset + lrows*lcols - newmatrix_offset = recvoffset + lrows*lcols - work_offset = newmatrix_offset + lrows*lcols - - if (trans .eq. 1) then - dimsize = n - fixedsize = m - else - dimsize = m - fixedsize = n - end if - - if (dimsize .le. 1) then - return ! nothing to do - end if - - ! 1. adjust step size to remainder size - nr_blocks = dimsize / b - nr_blocks = nr_blocks * b - step = dimsize - nr_blocks - if (step .eq. 0) step = b - - ! 2. iterate over destination blocks starting with process 0 - current_index = 1 - do while (current_index .le. dimsize) - destblk = (current_index-1) / b - dest_process = mod(destblk,mpiprocs) - srcblk = (dimsize-current_index) / b - src_process = mod(srcblk,mpiprocs) - - next_index = current_index+step - - ! 
block for dest_process is located on mpirank if lsize > 0 - call local_size_offset_1d(dimsize-current_index+1,b,dimsize-next_index+2,dimsize-next_index+2,0, & - src_process,mpiprocs,lsize,baseoffset,offset) - - sendcount = lsize*fixedsize - recvcount = sendcount - - ! TODO: this send/recv stuff seems to blow up on BlueGene/P - ! TODO: is there actually room for the requested matrix part? the target - ! process might not have any parts at all (thus no room) - if ((src_process .eq. mpirank) .and. (dest_process .eq. src_process)) then - ! 5. pack data - if (trans .eq. 1) then - do icol=offset,offset+lsize-1 - work(sendoffset+(icol-offset)*lrows:sendoffset+(icol-offset+1)*lrows-1) = & - a(1:lrows,icol) - end do - else - do icol=1,lcols - work(sendoffset+(icol-1)*lsize:sendoffset+icol*lsize-1) = & - a(offset:offset+lsize-1,icol) - end do - end if - - ! 7. reverse data - if (trans .eq. 1) then - call reverse_matrix_local(1,lrows,lsize,work(sendoffset),lrows,work(work_offset),lwork) - else - call reverse_matrix_local(0,lsize,lcols,work(sendoffset),lsize,work(work_offset),lwork) - end if - - ! 8. store in temp matrix - if (trans .eq. 1) then - do icol=1,lsize - work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+icol*lrows-1) = & - work(sendoffset+(icol-1)*lrows:sendoffset+icol*lrows-1) - end do - - newmatrix_offset = newmatrix_offset + lsize*lrows - else - do icol=1,lcols - work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+(icol-1)*lrows+lsize-1) = & - work(sendoffset+(icol-1)*lsize:sendoffset+icol*lsize-1) - end do - - newmatrix_offset = newmatrix_offset + lsize - end if - else - - if (dest_process .eq. mpirank) then - ! 6b. call MPI_Recv - call MPI_Recv(work(recvoffset), recvcount, mpi_real8, & - src_process, current_index, mpicomm, mpistatus, mpierr) - - ! 7. reverse data - if (trans .eq. 
1) then - call reverse_matrix_local(1,lrows,lsize,work(recvoffset),lrows,work(work_offset),lwork) - else - call reverse_matrix_local(0,lsize,lcols,work(recvoffset),lsize,work(work_offset),lwork) - end if - - ! 8. store in temp matrix - if (trans .eq. 1) then - do icol=1,lsize - work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+icol*lrows-1) = & - work(recvoffset+(icol-1)*lrows:recvoffset+icol*lrows-1) - end do - - newmatrix_offset = newmatrix_offset + lsize*lrows - else - do icol=1,lcols - work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+(icol-1)*lrows+lsize-1) = & - work(recvoffset+(icol-1)*lsize:recvoffset+icol*lsize-1) - end do - - newmatrix_offset = newmatrix_offset + lsize - end if - end if - - if (src_process .eq. mpirank) then - ! 5. pack data - if (trans .eq. 1) then - do icol=offset,offset+lsize-1 - work(sendoffset+(icol-offset)*lrows:sendoffset+(icol-offset+1)*lrows-1) = & - a(1:lrows,icol) - end do - else - do icol=1,lcols - work(sendoffset+(icol-1)*lsize:sendoffset+icol*lsize-1) = & - a(offset:offset+lsize-1,icol) - end do - end if - - ! 6a. call MPI_Send - call MPI_Send(work(sendoffset), sendcount, mpi_real8, & - dest_process, current_index, mpicomm, mpierr) - end if - end if - - current_index = next_index - end do - - ! 9. copy temp matrix to real matrix - newmatrix_offset = recvoffset + lrows*lcols - do icol=1,lcols - a(1:lrows,icol) = & - work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+icol*lrows-1) - end do -end subroutine reverse_matrix_1dcomm -end module diff --git a/src/elpa_reduce_add_vectors.X90 b/src/elpa_reduce_add_vectors.X90 deleted file mode 100644 index 9651a92ec..000000000 --- a/src/elpa_reduce_add_vectors.X90 +++ /dev/null @@ -1,188 +0,0 @@ -#if 0 -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! 
Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". 
-#endif - -#if REALCASE==1 -subroutine elpa_reduce_add_vectors_real(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvr,nvc,nblk) -#endif -#if COMPLEXCASE==1 -subroutine elpa_reduce_add_vectors_complex(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvr,nvc,nblk) -#endif - -!------------------------------------------------------------------------------- -! This routine does a reduce of all vectors in vmat_s over the communicator comm_t. -! The result of the reduce is gathered on the processors owning the diagonal -! and added to the array of vectors vmat_t (which is distributed over comm_t). -! -! Opposed to elpa_transpose_vectors, there is NO identical copy of vmat_s -! in the different members within vmat_t (else a reduce wouldn't be necessary). -! After this routine, an allreduce of vmat_t has to be done. -! -! vmat_s array of vectors to be reduced and added -! ld_s leading dimension of vmat_s -! comm_s communicator over which vmat_s is distributed -! vmat_t array of vectors to which vmat_s is added -! ld_t leading dimension of vmat_t -! comm_t communicator over which vmat_t is distributed -! nvr global length of vmat_s/vmat_t -! nvc number of columns in vmat_s/vmat_t -! nblk block size of block cyclic distribution -! -!------------------------------------------------------------------------------- - - use precision -! use ELPA1 ! 
for least_common_multiple -#ifdef WITH_OPENMP - use omp_lib -#endif - implicit none - - include 'mpif.h' - - integer(kind=ik), intent(in) :: ld_s, comm_s, ld_t, comm_t, nvr, nvc, nblk - DATATYPE, intent(in) :: vmat_s(ld_s,nvc) - DATATYPE, intent(inout) :: vmat_t(ld_t,nvc) - - DATATYPE, allocatable :: aux1(:), aux2(:) - integer(kind=ik) :: myps, mypt, nps, npt - integer(kind=ik) :: n, lc, k, i, ips, ipt, ns, nl, mpierr - integer(kind=ik) :: lcm_s_t, nblks_tot - integer(kind=ik) :: auxstride, tylerk, error_unit - - call mpi_comm_rank(comm_s,myps,mpierr) - call mpi_comm_size(comm_s,nps ,mpierr) - call mpi_comm_rank(comm_t,mypt,mpierr) - call mpi_comm_size(comm_t,npt ,mpierr) - - ! Look to elpa_transpose_vectors for the basic idea! - - ! The communictation pattern repeats in the global matrix after - ! the least common multiple of (nps,npt) blocks - - lcm_s_t = least_common_multiple(nps,npt) ! least common multiple of nps, npt - - nblks_tot = (nvr+nblk-1)/nblk ! number of blocks corresponding to nvr - - allocate(aux1( ((nblks_tot+lcm_s_t-1)/lcm_s_t) * nblk * nvc )) - allocate(aux2( ((nblks_tot+lcm_s_t-1)/lcm_s_t) * nblk * nvc )) - aux1(:) = 0 - aux2(:) = 0 -#ifdef WITH_OPENMP - !$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl) -#endif - do n = 0, lcm_s_t-1 - - ips = mod(n,nps) - ipt = mod(n,npt) - - auxstride = nblk * ((nblks_tot - n + lcm_s_t - 1)/lcm_s_t) - - if(myps == ips) then - -! k = 0 -#ifdef WITH_OPENMP - !$omp do -#endif - do lc=1,nvc - do i = n, nblks_tot-1, lcm_s_t - k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride - ns = (i/nps)*nblk ! local start of block i - nl = min(nvr-i*nblk,nblk) ! length - aux1(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc) -! 
k = k+nblk - enddo - enddo - - k = nvc * auxstride -#ifdef WITH_OPENMP - !$omp barrier - !$omp master -#endif -#if REALCASE==1 - if(k>0) call mpi_reduce(aux1,aux2,k,MPI_REAL8,MPI_SUM,ipt,comm_t,mpierr) -#endif - -#if COMPLEXCASE==1 - if(k>0) call mpi_reduce(aux1,aux2,k,MPI_DOUBLE_COMPLEX,MPI_SUM,ipt,comm_t,mpierr) -#endif - -#ifdef WITH_OPENMP - !$omp end master - !$omp barrier -#endif - if (mypt == ipt) then -! k = 0 -#ifdef WITH_OPENMP - !$omp do -#endif - do lc=1,nvc - do i = n, nblks_tot-1, lcm_s_t - k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride - ns = (i/npt)*nblk ! local start of block i - nl = min(nvr-i*nblk,nblk) ! length - vmat_t(ns+1:ns+nl,lc) = vmat_t(ns+1:ns+nl,lc) + aux2(k+1:k+nl) -! k = k+nblk - enddo - enddo - endif - - endif - - enddo -#ifdef WITH_OPENMP - !$omp end parallel -#endif - - deallocate(aux1) - deallocate(aux2) - -end subroutine - - diff --git a/src/elpa_transpose_vectors.X90 b/src/elpa_transpose_vectors.X90 deleted file mode 100644 index 94543a2da..000000000 --- a/src/elpa_transpose_vectors.X90 +++ /dev/null @@ -1,190 +0,0 @@ -#if 0 -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! 
GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -#endif - -#if REALCASE==1 -subroutine elpa_transpose_vectors_real(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvs,nvr,nvc,nblk) -#endif -#if COMPLEXCASE==1 -subroutine elpa_transpose_vectors_complex(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvs,nvr,nvc,nblk) -#endif - -!------------------------------------------------------------------------------- -! This routine transposes an array of vectors which are distributed in -! communicator comm_s into its transposed form distributed in communicator comm_t. -! There must be an identical copy of vmat_s in every communicator comm_s. -! After this routine, there is an identical copy of vmat_t in every communicator comm_t. -! -! vmat_s original array of vectors -! ld_s leading dimension of vmat_s -! 
comm_s communicator over which vmat_s is distributed -! vmat_t array of vectors in transposed form -! ld_t leading dimension of vmat_t -! comm_t communicator over which vmat_t is distributed -! nvs global index where to start in vmat_s/vmat_t -! Please note: this is kind of a hint, some values before nvs will be -! accessed in vmat_s/put into vmat_t -! nvr global length of vmat_s/vmat_t -! nvc number of columns in vmat_s/vmat_t -! nblk block size of block cyclic distribution -! -!------------------------------------------------------------------------------- - use precision - -! use ELPA1 ! for least_common_multiple -#ifdef WITH_OPENMP - use omp_lib -#endif - - implicit none - - include 'mpif.h' - - integer(kind=ik), intent(in) :: ld_s, comm_s, ld_t, comm_t, nvs, nvr, nvc, nblk - DATATYPE, intent(in) :: vmat_s(ld_s,nvc) - DATATYPE, intent(inout) :: vmat_t(ld_t,nvc) - - DATATYPE, allocatable :: aux(:) - integer(kind=ik) :: myps, mypt, nps, npt - integer(kind=ik) :: n, lc, k, i, ips, ipt, ns, nl, mpierr - integer(kind=ik) :: lcm_s_t, nblks_tot, nblks_comm, nblks_skip - integer(kind=ik) :: auxstride - - call mpi_comm_rank(comm_s,myps,mpierr) - call mpi_comm_size(comm_s,nps ,mpierr) - call mpi_comm_rank(comm_t,mypt,mpierr) - call mpi_comm_size(comm_t,npt ,mpierr) - - ! The basic idea of this routine is that for every block (in the block cyclic - ! distribution), the processor within comm_t which owns the diagonal - ! broadcasts its values of vmat_s to all processors within comm_t. - ! Of course this has not to be done for every block separately, since - ! the communictation pattern repeats in the global matrix after - ! the least common multiple of (nps,npt) blocks - - lcm_s_t = least_common_multiple(nps,npt) ! least common multiple of nps, npt - - nblks_tot = (nvr+nblk-1)/nblk ! number of blocks corresponding to nvr - - ! Get the number of blocks to be skipped at the begin. - ! This must be a multiple of lcm_s_t (else it is getting complicated), - ! 
thus some elements before nvs will be accessed/set. - - nblks_skip = ((nvs-1)/(nblk*lcm_s_t))*lcm_s_t - - allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc )) -#ifdef WITH_OPENMP - !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n) -#endif - do n = 0, lcm_s_t-1 - - ips = mod(n,nps) - ipt = mod(n,npt) - - if(mypt == ipt) then - - nblks_comm = (nblks_tot-nblks_skip-n+lcm_s_t-1)/lcm_s_t - auxstride = nblk * nblks_comm -! if(nblks_comm==0) cycle - if (nblks_comm .ne. 0) then - if(myps == ips) then -! k = 0 -#ifdef WITH_OPENMP - !$omp do -#endif - do lc=1,nvc - do i = nblks_skip+n, nblks_tot-1, lcm_s_t - k = (i - nblks_skip - n)/lcm_s_t * nblk + (lc - 1) * auxstride - ns = (i/nps)*nblk ! local start of block i - nl = min(nvr-i*nblk,nblk) ! length - aux(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc) -! k = k+nblk - enddo - enddo - endif - -#ifdef WITH_OPENMP - !$omp barrier - !$omp master -#endif -#if COMPLEXCASE==1 - call MPI_Bcast(aux,nblks_comm*nblk*nvc,MPI_DOUBLE_COMPLEX,ips,comm_s,mpierr) -#endif - -#if REALCASE==1 - call MPI_Bcast(aux,nblks_comm*nblk*nvc,MPI_REAL8,ips,comm_s,mpierr) -#endif -#ifdef WITH_OPENMP - !$omp end master - !$omp barrier - - !$omp do -#endif -! k = 0 - do lc=1,nvc - do i = nblks_skip+n, nblks_tot-1, lcm_s_t - k = (i - nblks_skip - n)/lcm_s_t * nblk + (lc - 1) * auxstride - ns = (i/npt)*nblk ! local start of block i - nl = min(nvr-i*nblk,nblk) ! length - vmat_t(ns+1:ns+nl,lc) = aux(k+1:k+nl) -! k = k+nblk - enddo - enddo - endif - endif - - enddo -#ifdef WITH_OPENMP - !$omp end parallel -#endif - deallocate(aux) - -end subroutine - diff --git a/src/elpa_utilities.F90 b/src/elpa_utilities.F90 deleted file mode 100644 index a743c5dab..000000000 --- a/src/elpa_utilities.F90 +++ /dev/null @@ -1,136 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! 
- Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - - - -! 
ELPA2 -- 2-stage solver for ELPA -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - - -#include "config-f90.h" - -module ELPA_utilities - -#ifdef HAVE_ISO_FORTRAN_ENV - use iso_fortran_env, only : error_unit -#endif - use precision - implicit none - - private ! By default, all routines contained are private - - public :: debug_messages_via_environment_variable, pcol, prow, error_unit -#ifndef HAVE_ISO_FORTRAN_ENV - integer(kind=ik), parameter :: error_unit = 0 -#endif - - - !****** - contains - - function debug_messages_via_environment_variable() result(isSet) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - logical :: isSet - CHARACTER(len=255) :: ELPA_DEBUG_MESSAGES - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("debug_messages_via_environment_variable") -#endif - - isSet = .false. - -#if defined(HAVE_ENVIRONMENT_CHECKING) - call get_environment_variable("ELPA_DEBUG_MESSAGES",ELPA_DEBUG_MESSAGES) -#endif - if (trim(ELPA_DEBUG_MESSAGES) .eq. "yes") then - isSet = .true. - endif - if (trim(ELPA_DEBUG_MESSAGES) .eq. "no") then - isSet = .true. 
- endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("debug_messages_via_environment_variable") -#endif - - end function debug_messages_via_environment_variable - -!------------------------------------------------------------------------------- - - !Processor col for global col number - pure function pcol(i, nblk, np_cols) result(col) - use precision - implicit none - integer(kind=ik), intent(in) :: i, nblk, np_cols - integer(kind=ik) :: col - col = MOD((i-1)/nblk,np_cols) - end function - -!------------------------------------------------------------------------------- - - !Processor row for global row number - pure function prow(i, nblk, np_rows) result(row) - use precision - implicit none - integer(kind=ik), intent(in) :: i, nblk, np_rows - integer(kind=ik) :: row - row = MOD((i-1)/nblk,np_rows) - end function - -!------------------------------------------------------------------------------- - -end module ELPA_utilities diff --git a/src/ftimings/COPYING.LESSER b/src/ftimings/COPYING.LESSER deleted file mode 100644 index 65c5ca88a..000000000 --- a/src/ftimings/COPYING.LESSER +++ /dev/null @@ -1,165 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - - This version of the GNU Lesser General Public License incorporates -the terms and conditions of version 3 of the GNU General Public -License, supplemented by the additional permissions listed below. - - 0. Additional Definitions. - - As used herein, "this License" refers to version 3 of the GNU Lesser -General Public License, and the "GNU GPL" refers to version 3 of the GNU -General Public License. - - "The Library" refers to a covered work governed by this License, -other than an Application or a Combined Work as defined below. 
- - An "Application" is any work that makes use of an interface provided -by the Library, but which is not otherwise based on the Library. -Defining a subclass of a class defined by the Library is deemed a mode -of using an interface provided by the Library. - - A "Combined Work" is a work produced by combining or linking an -Application with the Library. The particular version of the Library -with which the Combined Work was made is also called the "Linked -Version". - - The "Minimal Corresponding Source" for a Combined Work means the -Corresponding Source for the Combined Work, excluding any source code -for portions of the Combined Work that, considered in isolation, are -based on the Application, and not on the Linked Version. - - The "Corresponding Application Code" for a Combined Work means the -object code and/or source code for the Application, including any data -and utility programs needed for reproducing the Combined Work from the -Application, but excluding the System Libraries of the Combined Work. - - 1. Exception to Section 3 of the GNU GPL. - - You may convey a covered work under sections 3 and 4 of this License -without being bound by section 3 of the GNU GPL. - - 2. Conveying Modified Versions. - - If you modify a copy of the Library, and, in your modifications, a -facility refers to a function or data to be supplied by an Application -that uses the facility (other than as an argument passed when the -facility is invoked), then you may convey a copy of the modified -version: - - a) under this License, provided that you make a good faith effort to - ensure that, in the event an Application does not supply the - function or data, the facility still operates, and performs - whatever part of its purpose remains meaningful, or - - b) under the GNU GPL, with none of the additional permissions of - this License applicable to that copy. - - 3. Object Code Incorporating Material from Library Header Files. 
- - The object code form of an Application may incorporate material from -a header file that is part of the Library. You may convey such object -code under terms of your choice, provided that, if the incorporated -material is not limited to numerical parameters, data structure -layouts and accessors, or small macros, inline functions and templates -(ten or fewer lines in length), you do both of the following: - - a) Give prominent notice with each copy of the object code that the - Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the object code with a copy of the GNU GPL and this license - document. - - 4. Combined Works. - - You may convey a Combined Work under terms of your choice that, -taken together, effectively do not restrict modification of the -portions of the Library contained in the Combined Work and reverse -engineering for debugging such modifications, if you also do each of -the following: - - a) Give prominent notice with each copy of the Combined Work that - the Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the Combined Work with a copy of the GNU GPL and this license - document. - - c) For a Combined Work that displays copyright notices during - execution, include the copyright notice for the Library among - these notices, as well as a reference directing the user to the - copies of the GNU GPL and this license document. - - d) Do one of the following: - - 0) Convey the Minimal Corresponding Source under the terms of this - License, and the Corresponding Application Code in a form - suitable for, and under terms that permit, the user to - recombine or relink the Application with a modified version of - the Linked Version to produce a modified Combined Work, in the - manner specified by section 6 of the GNU GPL for conveying - Corresponding Source. - - 1) Use a suitable shared library mechanism for linking with the - Library. 
A suitable mechanism is one that (a) uses at run time - a copy of the Library already present on the user's computer - system, and (b) will operate properly with a modified version - of the Library that is interface-compatible with the Linked - Version. - - e) Provide Installation Information, but only if you would otherwise - be required to provide such information under section 6 of the - GNU GPL, and only to the extent that such information is - necessary to install and execute a modified version of the - Combined Work produced by recombining or relinking the - Application with a modified version of the Linked Version. (If - you use option 4d0, the Installation Information must accompany - the Minimal Corresponding Source and Corresponding Application - Code. If you use option 4d1, you must provide the Installation - Information in the manner specified by section 6 of the GNU GPL - for conveying Corresponding Source.) - - 5. Combined Libraries. - - You may place library facilities that are a work based on the -Library side by side in a single library together with other library -facilities that are not Applications and are not covered by this -License, and convey such a combined library under terms of your -choice, if you do both of the following: - - a) Accompany the combined library with a copy of the same work based - on the Library, uncombined with any other library facilities, - conveyed under the terms of this License. - - b) Give prominent notice with the combined library that part of it - is a work based on the Library, and explaining where to find the - accompanying uncombined form of the same work. - - 6. Revised Versions of the GNU Lesser General Public License. - - The Free Software Foundation may publish revised and/or new versions -of the GNU Lesser General Public License from time to time. Such new -versions will be similar in spirit to the present version, but may -differ in detail to address new problems or concerns. 
- - Each version is given a distinguishing version number. If the -Library as you received it specifies that a certain numbered version -of the GNU Lesser General Public License "or any later version" -applies to it, you have the option of following the terms and -conditions either of that published version or of any later version -published by the Free Software Foundation. If the Library as you -received it does not specify a version number of the GNU Lesser -General Public License, you may choose any version of the GNU Lesser -General Public License ever published by the Free Software Foundation. - - If the Library as you received it specifies that a proxy can decide -whether future versions of the GNU Lesser General Public License shall -apply, that proxy's public statement of acceptance of any version is -permanent authorization for you to choose that version for the -Library. diff --git a/src/ftimings/ftimings.F90 b/src/ftimings/ftimings.F90 deleted file mode 100644 index 7c0c4517a..000000000 --- a/src/ftimings/ftimings.F90 +++ /dev/null @@ -1,1472 +0,0 @@ -! Copyright 2014 Lorenz Hüdepohl -! -! This file is part of ftimings. -! -! ftimings is free software: you can redistribute it and/or modify -! it under the terms of the GNU Lesser General Public License as published by -! the Free Software Foundation, either version 3 of the License, or -! (at your option) any later version. -! -! ftimings is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ftimings. If not, see <http://www.gnu.org/licenses/>. 
- -#ifdef HAVE_CONFIG_H -#include "config-f90.h" -#endif - -!> \mainpage Ftimings -!> -!> An almost pure-fortran attempt to play with tree structures, which evolved -!> into the timing library used e.g. by the VERTEX supernova code. -!> -!> All you need to know is contained in the \ref ftimings::timer_t derived type. -module ftimings - use ftimings_type - use ftimings_value - use, intrinsic :: iso_fortran_env, only : error_unit, output_unit - implicit none - save - - private - - ! this is mainly needed for Doxygen, they are - ! by implicitly reachable as type-bound procedures - ! of timer_t, however Doxygen does not document them - ! if they are not also public - public timer_start, timer_stop, timer_free, timer_print, & - timer_enable, timer_disable, timer_is_enabled, & - timer_in_entries, timer_get, timer_since, timer_sort, & - timer_set_print_options, & - timer_measure_flops, & - timer_measure_allocated_memory, & - timer_measure_virtual_memory, & - timer_measure_max_allocated_memory, & - timer_measure_memory_bandwidth - - character(len=name_length), private, parameter :: own = "(own)" - character(len=name_length), private, parameter :: below = "(below threshold)" - - !> Type for a timer instance. - !> - !> Typical usage: - !> \code{.f90} - !> type(timer_t) :: timer - !> - !> call timer%enable() - !> - !> call timer%start("section") - !> ... - !> call timer%start("subsection") - !> ... - !> call timer%stop("subsection") - !> ... - !> call timer%stop("section") - !> - !> call timer%print() - !> \endcode - !> - !> Every first call to timer%start() at a certain point in the graph - !> allocates a small amount of memory. If the timer is no longer needed, - !> all that memory can be freed again with - !> - !> \code{.f90} - !> call timer%free() - !> \endcode - type, public :: timer_t - logical, private :: active = .false. !< If set to .false., most operations return immediately without any action - logical, private :: record_allocated_memory = .false. 
!< IF set to .true., record also the current resident set size - logical, private :: record_virtual_memory = .false. !< IF set to .true., record also the virtual memory - logical, private :: record_max_allocated_memory = .false. !< IF set to .true., record also the max resident set size ("high water mark") - logical, private :: record_flop_counts = .false. !< If set to .true., record also FLOP counts via PAPI calls - logical, private :: record_memory_bandwidth = .false. !< If set to .true., record also FLOP counts via PAPI calls - - logical, private :: print_allocated_memory = .false. - logical, private :: print_max_allocated_memory = .false. - logical, private :: print_virtual_memory = .false. - logical, private :: print_flop_count = .false. - logical, private :: print_flop_rate = .false. - logical, private :: print_ldst = .false. - logical, private :: print_memory_bandwidth = .false. - logical, private :: print_ai = .false. - integer, private :: bytes_per_ldst = 8 - - type(node_t), private, pointer :: root => NULL() !< Start of graph - type(node_t), private, pointer :: current_node => NULL() !< Current position in the graph - contains - procedure, pass :: start => timer_start - procedure, pass :: stop => timer_stop - procedure, pass :: free => timer_free - procedure, pass :: print => timer_print - procedure, pass :: enable => timer_enable - procedure, pass :: disable => timer_disable - procedure, pass :: is_enabled => timer_is_enabled - procedure, pass :: measure_flops => timer_measure_flops - procedure, pass :: measure_allocated_memory => timer_measure_allocated_memory - procedure, pass :: measure_virtual_memory => timer_measure_virtual_memory - procedure, pass :: measure_max_allocated_memory => timer_measure_max_allocated_memory - procedure, pass :: measure_memory_bandwidth => timer_measure_memory_bandwidth - procedure, pass :: set_print_options => timer_set_print_options - procedure, pass :: in_entries => timer_in_entries - procedure, pass :: get => timer_get 
- procedure, pass :: since => timer_since - procedure, pass :: sort => timer_sort - end type - - ! Private type node_t, representing a graph node - ! - type :: node_t - character(len=name_length) :: name ! Descriptive name, used when printing the timings - integer :: count = 0 ! Number of node_stop calls - type(value_t) :: value ! The actual counter data, see ftimings_values.F90 - logical :: is_running = .false. ! .true. if still running - type(node_t), pointer :: firstChild => NULL() - type(node_t), pointer :: lastChild => NULL() - type(node_t), pointer :: parent => NULL() - type(node_t), pointer :: nextSibling => NULL() - class(timer_t), pointer :: timer - contains - procedure, pass :: now => node_now - procedure, pass :: start => node_start - procedure, pass :: stop => node_stop - procedure, pass :: get_value => node_get_value - procedure, pass :: new_child => node_new_child - procedure, pass :: get_child => node_get_child - procedure, pass :: sum_of_children => node_sum_of_children - procedure, pass :: sum_of_children_with_name => node_sum_of_children_with_name - procedure, pass :: sum_of_children_below => node_sum_of_children_below - procedure, pass :: print => node_print - procedure, pass :: print_graph => node_print_graph - procedure, pass :: sort_children => node_sort_children - end type - - interface - function microseconds_since_epoch() result(us) bind(C, name="ftimings_microseconds_since_epoch") - use, intrinsic :: iso_c_binding - implicit none - integer(kind=C_INT64_T) :: us - end function - end interface - -#ifdef HAVE_LIBPAPI - interface - function flop_init() result(ret) bind(C, name="ftimings_flop_init") - use, intrinsic :: iso_c_binding - implicit none - integer(kind=C_INT) :: ret - end function - end interface - - interface - function loads_stores_init() result(ret) bind(C, name="ftimings_loads_stores_init") - use, intrinsic :: iso_c_binding - implicit none - integer(kind=C_INT) :: ret - end function - end interface - - interface - subroutine 
papi_counters(flops, ldst) bind(C, name="ftimings_papi_counters") - use, intrinsic :: iso_c_binding - implicit none - integer(kind=C_LONG_LONG), intent(out) :: flops, ldst - end subroutine - end interface -#endif - - interface - function resident_set_size() result(rsssize) bind(C, name="ftimings_resident_set_size") - use, intrinsic :: iso_c_binding - implicit none - integer(kind=C_LONG) :: rsssize - end function - end interface - - interface - function virtual_memory() result(virtualmem) bind(C, name="ftimings_virtual_memory") - use, intrinsic :: iso_c_binding - implicit none - integer(kind=C_LONG) :: virtualmem - end function - end interface - - interface - function max_resident_set_size() result(maxrsssize) bind(C, name="ftimings_highwater_mark") - use, intrinsic :: iso_c_binding - implicit none - integer(kind=C_LONG) :: maxrsssize - end function - end interface - - contains - - !> Activate the timer, without this, most methods are non-ops. - !> - subroutine timer_enable(self) - class(timer_t), intent(inout), target :: self - - self%active = .true. - end subroutine - - !> Call with enabled = .true. to also record amount of newly allocated memory. - !> By default, memory usage is not recored. Call with .false. to deactivate again. - !> - !> This opens /proc/self/statm, parses it, and closes it agagain and is thus - !> quite costly, use when appropriate. - !> - subroutine timer_measure_allocated_memory(self, enabled) - class(timer_t), intent(inout) :: self - logical, intent(in) :: enabled - - self%record_allocated_memory = enabled - end subroutine - - !> Call with enabled = .true. to also record amount of newly created virtual memory. - !> By default, memory usage is not recored. Call with .false. to deactivate again. - !> - !> This opens /proc/self/statm, parses it, and closes it agagain and is thus - !> quite costly, use when appropriate. 
- !> - subroutine timer_measure_virtual_memory(self, enabled) - class(timer_t), intent(inout) :: self - logical, intent(in) :: enabled - - self%record_virtual_memory = enabled - end subroutine - - !> Call with enabled = .true. to also record amount of newly increase of max. - !> resident memory - !> By default, memory usage is not recored. Call with .false. to deactivate again. - !> - !> This opens /proc/self/status, parses it, and closes it agagain and is thus - !> quite costly, use when appropriate. - !> - subroutine timer_measure_max_allocated_memory(self, enabled) - class(timer_t), intent(inout) :: self - logical, intent(in) :: enabled - - self%record_max_allocated_memory = enabled - end subroutine - - !> Call with enabled = .true. to also record the memory bandwidth with PAPI - !> By default, this is not recorded. Call with .false. to deactivate again. - !> - subroutine timer_measure_memory_bandwidth(self, enabled) - class(timer_t), intent(inout) :: self - logical, intent(in) :: enabled - - if (enabled) then -#ifdef HAVE_LIBPAPI - if (loads_stores_init() == 1) then - self%record_memory_bandwidth = .true. - else - write(0,'(a)') "ftimings: Could not initialize PAPI, disabling memory bandwidth counter" - self%record_memory_bandwidth = .false. - endif -#else - write(0,'(a)') "ftimings: not compiled with PAPI support, disabling memory bandwidth counter" - self%record_memory_bandwidth = .false. -#endif - else - ! explicitly set to .false. by caller - self%record_memory_bandwidth = .false. - endif - end subroutine - - !> Call with enabled = .true. to also record FLOP counts via PAPI calls. - !> By default no FLOPS are recored. Call with .false. to deactivate again. - !> - subroutine timer_measure_flops(self, enabled) - class(timer_t), intent(inout) :: self - logical, intent(in) :: enabled - - if (enabled) then -#ifdef HAVE_LIBPAPI - if (flop_init() == 1) then - self%record_flop_counts = .true. 
- else - write(0,'(a)') "ftimings: Could not initialize PAPI, disabling FLOP counter" - self%record_flop_counts = .false. - endif -#else - write(0,'(a)') "ftimings: not compiled with PAPI support, disabling FLOP counter" - self%record_flop_counts = .false. -#endif - else - ! Explicitly set to .false. by caller - self%record_flop_counts = .false. - endif - end subroutine - - !> Deactivate the timer - !> - subroutine timer_disable(self) - class(timer_t), intent(inout), target :: self - self%active = .false. - end subroutine - - !> Return whether the timer is currently running - !> - function timer_is_enabled(self) result(is) - class(timer_t), intent(inout), target :: self - logical :: is - is = self%active - end function - - !> Control what to print on following %print calls - !> - !> \param print_allocated_memory Amount of newly allocated, - !> resident memory - !> \param print_virtual_memory Amount of newly created virtual - !> memory - !> \param print_max_allocated_memory Amount of new increase of max. 
- !> resident memory ("high water mark") - !> \param print_flop_count Number of floating point operations - !> \param print_flop_rate Rate of floating point operations per second - !> \param print_ldst Number of loads+stores - !> \param print_memory_bandwidth Rate of loads+stores per second - !> \param print_ai Arithmetic intensity, that is number of - !> floating point operations per - !> number of load and store - !> operations (currently untested) - !> \param bytes_per_ldst For calculating the AI, assume this number - !> of bytes per load or store (default: 8) - subroutine timer_set_print_options(self, & - print_allocated_memory, & - print_virtual_memory, & - print_max_allocated_memory, & - print_flop_count, & - print_flop_rate, & - print_ldst, & - print_memory_bandwidth, & - print_ai, & - bytes_per_ldst) - class(timer_t), intent(inout) :: self - logical, intent(in), optional :: & - print_allocated_memory, & - print_virtual_memory, & - print_max_allocated_memory, & - print_flop_count, & - print_flop_rate, & - print_ldst, & - print_memory_bandwidth, & - print_ai - integer, intent(in), optional :: bytes_per_ldst - - if (present(print_allocated_memory)) then - self%print_allocated_memory = print_allocated_memory - if ((.not. self%record_allocated_memory) .and. self%print_allocated_memory) then - write(0,'(a)') "ftimings: Warning: RSS size recording was disabled, expect zeros!" - endif - endif - - if (present(print_virtual_memory)) then - self%print_virtual_memory = print_virtual_memory - if ((.not. self%record_virtual_memory) .and. self%print_virtual_memory) then - write(0,'(a)') "ftimings: Warning: Virtual memory recording was disabled, expect zeros!" - endif - endif - - if (present(print_max_allocated_memory)) then - self%print_max_allocated_memory = print_max_allocated_memory - if ((.not. self%record_max_allocated_memory) .and. self%print_max_allocated_memory) then - write(0,'(a)') "ftimings: Warning: HWM recording was disabled, expect zeros!" 
- endif - endif - - if (present(print_flop_count)) then - self%print_flop_count = print_flop_count - if ((.not. self%record_flop_counts) .and. self%print_flop_count) then - write(0,'(a)') "ftimings: Warning: FLOP counter was disabled, expect zeros!" - endif - endif - - if (present(print_flop_rate)) then - self%print_flop_rate = print_flop_rate - if ((.not. self%record_flop_counts) .and. self%print_flop_rate) then - write(0,'(a)') "ftimings: Warning: FLOP counter was disabled, expect zeros!" - endif - endif - - if (present(print_ldst)) then - self%print_ldst = print_ldst - if ((.not. self%record_memory_bandwidth) .and. self%print_ldst) then - write(0,'(a)') "ftimings: Warning: Load+Store counters were disabled, expect zeros!" - endif - endif - if (present(print_memory_bandwidth)) then - self%print_memory_bandwidth = print_memory_bandwidth - if ((.not. self%record_memory_bandwidth) .and. self%print_memory_bandwidth) then - write(0,'(a)') "ftimings: Warning: Load+Store counters were disabled, expect zeros for memory bandwidth!" - endif - endif - - if (present(print_ai)) then - self%print_ai = print_ai - if (.not. (self%record_memory_bandwidth .and. self%record_flop_counts)) then - write(0,'(a)') "ftimings: Warning: Memory bandwidth or FLOP counters were disabled, expect invalid values for AI" - endif - endif - - if (present(bytes_per_ldst)) then - self%bytes_per_ldst = bytes_per_ldst - endif - end subroutine - - !> Start a timing section - !> - !> \param name A descriptive name - !> \param replace If .true. (default .false.), replace any entries at the - !> current position with the same name. If .false., add the - !> time to a possibly existing entry - !> - !> Care must be taken to balance any invocations of %start() and %stop(), e.g. 
- !> the following is valid - !> - !> \code{.f90} - !> call timer%start("A") - !> call timer%start("B") - !> call timer%stop("B") - !> call timer%stop("A") - !> \endcode - !> - !> while the following is not - !> - !> \code{.f90} - !> call timer%start("A") - !> call timer%start("B") - !> call timer%stop("A") - !> call timer%stop("B") - !> \endcode - !> - subroutine timer_start(self, name, replace) - class(timer_t), intent(inout), target :: self - character(len=*), intent(in) :: name - logical, intent(in), optional :: replace - type(node_t), pointer :: node - !$ integer :: omp_get_thread_num, omp_get_num_threads, omp_get_level, omp_get_ancestor_thread_num - !$ integer :: i - - if (.not. self%active) then - return - endif - - ! Deal with nested parallelization - !$ do i = 0, omp_get_level() - !$ if (omp_get_ancestor_thread_num(i) > 0) then - !$ return - !$ endif - !$ end do - - !$omp master - - if (.not. associated(self%current_node)) then - ! First call to timer_start() - allocate(self%root) - self%root%name = "[Root]" - self%root%timer => self - call self%root%start() - nullify(self%root%firstChild) - nullify(self%root%lastChild) - nullify(self%root%parent) - nullify(self%root%nextSibling) - self%current_node => self%root - endif - - if (string_eq(self%current_node%name, name)) then - !$omp critical - write(error_unit,*) "Recursion error! Printing tree so far.." - write(error_unit,*) "Got %start(""" // trim(name) // """), while %start(""" // trim(name) // """) was still active" - !$ write(*,*) "omp_get_thread_num() = ", omp_get_thread_num() - !$ write(*,*) "omp_get_num_threads() = ", omp_get_num_threads() - !$ write(*,*) "omp_get_level() = ", omp_get_level() - !$ do i = 0, omp_get_level() - !$ write(*,*) "omp_get_ancestor_thread_num(", i, ") = ", omp_get_ancestor_thread_num(i) - !$ end do - call self%root%print_graph(0) - !$omp end critical - stop "timer_start() while same timer was active" - endif - node => self%current_node%get_child(name) - if (.not. 
associated(node)) then - node => self%current_node%new_child(name) - else if (present(replace)) then - if (replace) then - node%value = null_value - node%count = 0 - if (associated(node%firstChild)) then - call deallocate_node(node%firstChild) - nullify(node%firstChild) - nullify(node%lastChild) - endif - endif - endif - - call node%start() - - self%current_node => node - - !$omp end master - - end subroutine - - !> End a timing segment, \sa timer_start - !> - !> \param name The exact same name as was used for %start(). - !> If not provided, close the currently active region. - !> If given, warns if it does not match the last %start() - !> call on stderr and disables the current timer instance. - !> - subroutine timer_stop(self, name) - class(timer_t), intent(inout), target :: self - character(len=*), intent(in), optional :: name - logical :: error - !$ integer :: omp_get_level, omp_get_ancestor_thread_num - !$ integer :: i - - if (.not. self%active) then - return - endif - - ! Deal with nested parallelization - !$ do i = 0, omp_get_level() - !$ if (omp_get_ancestor_thread_num(i) > 0) then - !$ return - !$ endif - !$ end do - - !$omp master - error = .false. - - if (.not. associated(self%current_node)) then - write(error_unit,'(a)') "Called timer_stop() without first calling any timer_start(), disabling timings" - call self%free() - self%active = .false. - error = .true. - else if (present(name)) then - if (.not. string_eq(self%current_node%name, name)) then - write(error_unit,'(a)') "Expected %stop(""" // trim(self%current_node%name) // """),& - & but got %stop(""" // trim(name) // """), disabling timings" - call self%free() - self%active = .false. - error = .true. - endif - endif - - if (.not. error) then - call self%current_node%stop() - - ! climb up to parent - if (.not. 
associated(self%current_node%parent)) then - write(error_unit,'(a)') "Error: No valid parent node found for node '" // trim(self%current_node%name) // "'" - call self%free() - self%active = .false. - endif - self%current_node => self%current_node%parent - - endif - !$omp end master - - end subroutine - - !> Deallocate all objects associated with (but not including) self - !> - subroutine timer_free(self) - class(timer_t), intent(inout), target :: self - if (associated(self%root)) then - call deallocate_node(self%root) - endif - nullify(self%root) - nullify(self%current_node) - end subroutine - - !> Print a timing graph - !> - !> \param name1 If given, first descend one level to the node with name name1 - !> \param name2 If given, also descend another level to the node with name2 there - !> \param name3 etc. - !> \param name4 etc. - !> \param threshold If given, subsume any entries with a value of threshold - !> seconds in a single node "(below threshold)" - !> \param is_sorted Assume a sorted graph for inserting "(own)" and "(below threshold)" - !> \param unit The unit number on which to print, default stdout - !> - subroutine timer_print(self, name1, name2, name3, name4, threshold, is_sorted, unit) - class(timer_t), intent(in), target :: self - character(len=*), intent(in), optional :: name1, name2, name3, name4 - real(kind=rk), intent(in), optional :: threshold - logical, intent(in), optional :: is_sorted - integer, intent(in), optional :: unit - - integer :: unit_act - - type(node_t), pointer :: node - character(len=64) :: format_spec - - ! I hate fortran's string handling - character(len=name_length), parameter :: group = "Group" - character(len=12), parameter :: seconds = " [s]" - character(len=12), parameter :: fract = " fraction" - character(len=12), parameter :: ram = " alloc. RAM" - character(len=12), parameter :: vmem = " alloc. VM" - character(len=12), parameter :: hwm = " alloc. 
HWM" - character(len=12), parameter :: flop_rate = " Mflop/s" - character(len=12), parameter :: flop_count = " Mflop" - character(len=12), parameter :: ldst = "loads+stores" - character(len=12), parameter :: bandwidth = " mem bandw." - character(len=12), parameter :: ai = "arithm. Int." - character(len=12), parameter :: dash = "============" - - if (.not. self%active) then - return - endif - - if (present(unit)) then - unit_act = unit - else - unit_act = output_unit - endif - - node => self%root - if (present(name1)) then - node => node%get_child(name1) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name1) // """" - return - endif - end if - if (present(name2)) then - node => node%get_child(name2) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name2) // """" - return - endif - end if - if (present(name3)) then - node => node%get_child(name3) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name3) // """" - return - endif - end if - if (present(name4)) then - node => node%get_child(name4) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name4) // """" - return - endif - end if - - ! I really do hate it .. 
- write(format_spec,'("("" /= "",a",i0,",2x,a12,1x,a12)")') name_length - write(unit_act, format_spec, advance='no') adjustl(group), seconds, fract - - if (self%print_allocated_memory) then - write(unit_act,'(1x,a12)',advance='no') ram - endif - - if (self%print_virtual_memory) then - write(unit_act,'(1x,a12)',advance='no') vmem - endif - - if (self%print_max_allocated_memory) then - write(unit_act,'(1x,a12)',advance='no') hwm - endif - - if (self%print_flop_count) then - write(unit_act,'(1x,a12)',advance='no') flop_count - endif - if (self%print_flop_rate) then - write(unit_act,'(1x,a12)',advance='no') flop_rate - endif - if (self%print_ldst) then - write(unit_act,'(1x,a12)',advance='no') ldst - endif - if (self%print_memory_bandwidth) then - write(unit_act,'(1x,a12)',advance='no') bandwidth - endif - if (self%print_ai) then - write(unit_act,'(1x,a12)',advance='no') ai - endif - - write(unit_act,'(a)') "" - - write(format_spec,'("("" | "",a",i0,",1x,2(1x,a12))")') name_length - write(unit_act, format_spec, advance='no') "", dash, dash - - if (self%print_allocated_memory) then - write(unit_act,'(1x,a12)',advance='no') dash - endif - - if (self%print_virtual_memory) then - write(unit_act,'(1x,a12)',advance='no') dash - endif - - if (self%print_max_allocated_memory) then - write(unit_act,'(1x,a12)',advance='no') dash - endif - - if (self%print_flop_count) then - write(unit_act,'(1x,a12)',advance='no') dash - endif - if (self%print_flop_rate) then - write(unit_act,'(1x,a12)',advance='no') dash - endif - if (self%print_ldst) then - write(unit_act,'(1x,a12)',advance='no') dash - endif - if (self%print_memory_bandwidth) then - write(unit_act,'(1x,a12)',advance='no') dash - endif - if (self%print_ai) then - write(unit_act,'(1x,a12)',advance='no') dash - endif - - write(unit_act,'(a)') "" - - call node%print_graph(0, threshold, is_sorted, unit=unit) - - end subroutine - - !> Return the sum of all entries with a certain name below - !> a given node. 
Specify the name with the last argument, the - !> path to the starting point with the first few parameters - !> - !> \param name1, .., namei-1 The path to the starting node - !> \param namei The name of all sub-entries below this - !> node which should be summed together - !> - !> For example timer%in_entries("foo", "bar", "parallel") returns - !> the sum of all entries named "parallel" below the foo->bar node - !> - function timer_in_entries(self, name1, name2, name3, name4) result(s) - use, intrinsic :: iso_fortran_env, only : error_unit - class(timer_t), intent(in), target :: self - character(len=*), intent(in) :: name1 - character(len=*), intent(in), optional :: name2, name3, name4 - real(kind=rk) :: s - type(node_t), pointer :: node ! the starting node - type(value_t) :: val - character(len=name_length) :: name ! the name of the sections - - s = 0._rk - - if (.not. self%active) then - return - endif - - node => self%root - name = name1 - - if (present(name2)) then - node => node%get_child(name1) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name1) // """" - return - endif - name = name2 - end if - if (present(name3)) then - node => node%get_child(name2) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name2) // """" - return - endif - name = name3 - end if - if (present(name4)) then - node => node%get_child(name3) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name3) // """" - return - endif - name = name4 - end if - - val = node%sum_of_children_with_name(name) - s = real(val%micros, kind=rk) * 1e-6_rk - end function - - !> Access a specific, already stopped entry of the graph by specifying the - !> names of the nodes along the graph from the root node - !> - !> The result is only meaningfull if the entry was never appended by - !> additional %start() calls. 
- !> - function timer_get(self, name1, name2, name3, name4, name5, name6) result(s) - class(timer_t), intent(in), target :: self - ! this is clunky, but what can you do.. - character(len=*), intent(in), optional :: name1, name2, name3, name4, name5, name6 - real(kind=rk) :: s - type(node_t), pointer :: node - - s = 0._rk - - if (.not. self%active) then - return - endif - - node => self%root - if (present(name1)) then - node => node%get_child(name1) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name1) // """" - return - endif - end if - if (present(name2)) then - node => node%get_child(name2) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name2) // """" - return - endif - end if - if (present(name3)) then - node => node%get_child(name3) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name3) // """" - return - endif - end if - if (present(name4)) then - node => node%get_child(name4) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name4) // """" - return - endif - end if - if (present(name5)) then - node => node%get_child(name5) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name5) // """" - return - endif - end if - if (present(name6)) then - node => node%get_child(name6) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name6) // """" - return - endif - end if - if (node%is_running) then - write(error_unit,'(a)') "Timer """ // trim(node%name) // """ not yet stopped" - return - endif - s = real(node%value%micros, kind=rk) * 1e-6_rk - end function - - !> Access a specific, not yet stopped entry of the graph by specifying the - !> names of the nodes along the graph from the root node and return the - !> seconds that have passed since the entry was created. 
- !> - !> The result is only meaningfull if the entry was never appended by - !> additional %start() calls. - !> - function timer_since(self, name1, name2, name3, name4) result(s) - class(timer_t), intent(in), target :: self - character(len=*), intent(in), optional :: name1, name2, name3, name4 - real(kind=rk) :: s - type(value_t) :: val - type(node_t), pointer :: node - - s = 0._rk - - node => self%root - if (present(name1)) then - node => node%get_child(name1) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name1) // """" - return - endif - end if - if (present(name2)) then - node => node%get_child(name2) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name2) // """" - return - endif - end if - if (present(name3)) then - node => node%get_child(name3) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name3) // """" - return - endif - end if - if (present(name4)) then - node => node%get_child(name4) - if (.not. associated(node)) then - write(error_unit,'(a)') "Could not descend to """ // trim(name4) // """" - return - endif - end if - if (node%is_running .neqv. .true.) then - write(error_unit,'(a)') "Timer """ // trim(node%name) // """ already stopped" - return - endif - val = node%value + node%now() - s = real(val%micros, kind=rk) * 1e-6_rk - end function - - !> Sort the graph on each level. - !> Warning: This irrevocable destroys the old ordering. - !> - subroutine timer_sort(self) - class(timer_t), intent(inout), target :: self - type(node_t), pointer :: node - - call sort_nodes(self%root, node) - - node => self%root - do while (associated(node)) - call node%sort_children() - node => node%nextSibling - enddo - end subroutine - - - - ! Now methods of node_t: - - - ! 
This is the function that actually returns the current timestamp and all other counters - function node_now(self) result(val) - use, intrinsic :: iso_c_binding - class(node_t), intent(in) :: self - type(value_t) :: val - - ! current time - val%micros = microseconds_since_epoch() - - if (self%timer%record_allocated_memory) then - val%rsssize = resident_set_size() - endif - - if (self%timer%record_virtual_memory) then - val%virtualmem = virtual_memory() - endif - - if (self%timer%record_max_allocated_memory) then - val%maxrsssize = max_resident_set_size() - endif - -#ifdef HAVE_LIBPAPI - if (self%timer%record_flop_counts .or. self%timer%record_memory_bandwidth) then - call papi_counters(val%flop_count, val%ldst) - endif -#endif - end function - - - subroutine node_start(self) - class(node_t), intent(inout) :: self - - ! take the time - self%value = self%value - self%now() - self%is_running = .true. - end subroutine - - subroutine node_stop(self) - class(node_t), intent(inout) :: self - - self%count = self%count + 1 - - ! take the time - self%value = self%value + self%now() - self%is_running = .false. - end subroutine - - function node_get_value(self) result(val) - class(node_t), intent(in) :: self - type(value_t) :: val - val = self%value - if (self%is_running) then - ! we have not finished, give time up to NOW - val = val + self%now() - endif - end function - - function node_new_child(self, name) result(new) - class(node_t), intent(inout), target :: self - character(len=*), intent(in) :: name - type(node_t), pointer :: new - - if (.not. 
associated(self%lastChild)) then - allocate(self%lastChild) - new => self%lastChild - self%firstChild => new - else - allocate(self%lastChild%nextSibling) - new => self%lastChild%nextSibling - self%lastChild => new - endif - - select type (self) - type is (node_t) - new%parent => self - class default - stop "node_new_child(): This should not happen" - end select - - new%name = name - new%count = 0 - new%timer => self%timer - - nullify(new%firstChild) - nullify(new%lastChild) - nullify(new%nextSibling) - end function - - - function string_eq(str1, str2) result(eq) - character(len=name_length), intent(in) :: str1 - character(len=*), intent(in) :: str2 - logical :: eq - eq = trim(str1) .eq. str2(1:min(len(trim(str2)), name_length)) - end function - - function node_get_child(self, name) result(child) - class(node_t), intent(in) :: self - character(len=*), intent(in) :: name - type(node_t), pointer :: child - - child => self%firstChild - do while (associated(child)) - if (string_eq(child%name, name)) then - return - endif - child => child%nextSibling - enddo - nullify(child) - end function - - recursive subroutine deallocate_node(entry) - type(node_t), intent(inout), pointer :: entry - type(node_t), pointer :: nextSibling - - if (associated(entry%firstChild)) then - call deallocate_node(entry%firstChild) - endif - nextSibling => entry%nextSibling - deallocate(entry) - nullify(entry) - if (associated(nextSibling)) then - call deallocate_node(nextSibling) - endif - end subroutine - - function node_sum_of_children(self) result(sum_time) - class(node_t), intent(in) :: self - type(node_t), pointer :: cur_entry - type(value_t) :: sum_time - - cur_entry => self%firstChild - do while (associated(cur_entry)) - sum_time = sum_time + cur_entry%get_value() - cur_entry => cur_entry%nextSibling - enddo - end function - - recursive function node_sum_of_children_with_name(self, name) result(sum_time) - class(node_t), intent(in) :: self - character(len=*), intent(in) :: name - 
type(node_t), pointer :: cur_entry - type(value_t) :: sum_time - - cur_entry => self%firstChild - do while (associated(cur_entry)) - if (string_eq(cur_entry%name, name)) then - sum_time = sum_time + cur_entry%value - else - sum_time = sum_time + cur_entry%sum_of_children_with_name(name) - endif - cur_entry => cur_entry%nextSibling - enddo - end function - - function node_sum_of_children_below(self, threshold) result(sum_time) - class(node_t), intent(in) :: self - real(kind=rk), intent(in), optional :: threshold - type(node_t), pointer :: cur_entry - type(value_t) :: sum_time, cur_value - - if (.not. present(threshold)) then - return - endif - - cur_entry => self%firstChild - - do while (associated(cur_entry)) - cur_value = cur_entry%get_value() - if (cur_value%micros * 1e-6_rk < threshold) then - sum_time = sum_time + cur_value - endif - cur_entry => cur_entry%nextSibling - enddo - end function - - subroutine insert_into_sorted_list(head, node) - type(node_t), pointer, intent(inout) :: head - type(node_t), target, intent(inout) :: node - type(node_t), pointer :: cur - - if (node%value%micros >= head%value%micros) then - node%nextSibling => head - head => node - return - endif - - cur => head - do while (associated(cur%nextSibling)) - if (cur%value%micros > node%value%micros .and. node%value%micros >= cur%nextSibling%value%micros) then - node%nextSibling => cur%nextSibling - cur%nextSibling => node - return - endif - cur => cur%nextSibling - end do - - ! 
node has to be appended at the end - cur%nextSibling => node - node%nextSibling => NULL() - end subroutine - - subroutine remove_from_list(head, node) - type(node_t), pointer, intent(inout) :: head - type(node_t), pointer, intent(in) :: node - type(node_t), pointer :: cur - - if (associated(head,node)) then - head => head%nextSibling - return - endif - - cur => head - do while (associated(cur%nextSibling)) - if (associated(cur%nextSibling,node)) then - cur%nextSibling => cur%nextSibling%nextSibling - return - endif - cur => cur%nextSibling - end do - end subroutine - - subroutine node_print(self, indent_level, total, unit) - class(node_t), intent(inout) :: self - integer, intent(in) :: indent_level - type(value_t), intent(in) :: total - type(value_t) :: val - integer, intent(in) :: unit - character(len=name_length) :: name, suffix - - if (self%is_running) then - name = trim(self%name) // " (running)" - else - name = self%name - endif - - if (self%count > 1) then - write(suffix, '(" (",i0,"x)")') self%count - name = trim(name) // " " // trim(suffix) - endif - - if (self%is_running) then - val = self%value + self%now() - else - val = self%value - endif - call print_value(val, self%timer, indent_level, name, total, unit) - end subroutine - - recursive subroutine node_print_graph(self, indent_level, threshold, is_sorted, total, unit) - use, intrinsic :: iso_fortran_env, only : output_unit - class(node_t), intent(inout) :: self - integer, intent(in) :: indent_level - real(kind=rk), intent(in), optional :: threshold - logical, intent(in), optional :: is_sorted - type(value_t), intent(in), optional :: total - integer, intent(in), optional :: unit - - type(node_t), pointer :: node - integer :: i - type(value_t) :: cur_value, node_value, own_value, below_threshold_value, total_act - type(node_t), pointer :: own_node, threshold_node - real(kind=rk) :: threshold_act - logical :: is_sorted_act, print_own, print_threshold - integer :: unit_act - - nullify(own_node) - 
nullify(threshold_node) - - if (present(threshold)) then - threshold_act = threshold - else - threshold_act = 0 - endif - - if (present(is_sorted)) then - is_sorted_act = is_sorted - else - is_sorted_act = .false. - endif - - cur_value = self%get_value() - - if (present(total)) then - total_act = total - else - total_act = cur_value - endif - - if (present(unit)) then - unit_act = unit - else - unit_act = output_unit - endif - - call self%print(indent_level, total_act, unit_act) - - own_value = cur_value - self%sum_of_children() - below_threshold_value = self%sum_of_children_below(threshold) - - print_own = associated(self%firstChild) - print_threshold = below_threshold_value%micros > 0 - - ! Deal with "(own)" and "(below threshold)" entries - if (is_sorted_act) then - ! sort them in - if (print_own) then - ! insert an "(own)" node - allocate(own_node) - own_node%value = own_value - own_node%name = own - own_node%timer => self%timer - call insert_into_sorted_list(self%firstChild, own_node) - endif - - if (print_threshold) then - ! insert a "(below threshold)" node - allocate(threshold_node) - threshold_node%value = below_threshold_value - threshold_node%name = below - threshold_node%timer => self%timer - call insert_into_sorted_list(self%firstChild, threshold_node) - endif - - else - ! print them first - if (print_own) then - call print_value(own_value, self%timer, indent_level + 1, own, cur_value, unit_act) - endif - if (print_threshold) then - call print_value(below_threshold_value, self%timer, indent_level + 1, below, cur_value, unit_act) - endif - endif - - ! print children - node => self%firstChild - do while (associated(node)) - node_value = node%get_value() - if (node_value%micros * 1e-6_rk >= threshold_act & - .or. associated(node, threshold_node) & - .or. associated(node, own_node)) then - call node%print_graph(indent_level + 1, threshold, is_sorted, cur_value, unit_act) - endif - node => node%nextSibling - end do - - if (is_sorted_act) then - ! 
remove inserted dummy nodes again - if (print_own) then - call remove_from_list(self%firstChild, own_node) - deallocate(own_node) - endif - if (print_threshold) then - call remove_from_list(self%firstChild, threshold_node) - deallocate(threshold_node) - endif - endif - - end subroutine - - ! In-place sort a node_t linked list and return the first and last element, - subroutine sort_nodes(head, tail) - type(node_t), pointer, intent(inout) :: head, tail - - type(node_t), pointer :: p, q, e - type(value_t) :: p_val, q_val - integer :: insize, nmerges, psize, qsize, i - - if (.not. associated(head)) then - nullify(tail) - return - endif - - insize = 1 - - do while (.true.) - p => head - nullify(head) - nullify(tail) - nmerges = 0 - - do while(associated(p)) - nmerges = nmerges + 1 - q => p - psize = 0 - do i = 1, insize - psize = psize + 1 - q => q%nextSibling - if (.not. associated(q)) then - exit - endif - end do - - qsize = insize - - do while (psize > 0 .or. (qsize > 0 .and. associated(q))) - if (psize == 0) then - e => q - q => q%nextSibling - qsize = qsize - 1 - - else if (qsize == 0 .or. (.not. 
associated(q))) then - e => p; - p => p%nextSibling - psize = psize - 1 - else - p_val = p%get_value() - q_val = q%get_value() - if (p_val%micros >= q_val%micros) then - e => p - p => p%nextSibling - psize = psize - 1 - - else - e => q - q => q%nextSibling - qsize = qsize - 1 - - end if - end if - - if (associated(tail)) then - tail%nextSibling => e - else - head => e - endif - tail => e - - end do - - p => q - - end do - - nullify(tail%nextSibling) - - if (nmerges <= 1) then - return - endif - - insize = insize * 2 - - end do - end subroutine - - - recursive subroutine node_sort_children(self) - class(node_t), intent(inout) :: self - type(node_t), pointer :: node - - call sort_nodes(self%firstChild, self%lastChild) - - node => self%firstChild - do while (associated(node)) - call node%sort_children() - node => node%nextSibling - enddo - end subroutine - - subroutine print_value(value, timer, indent_level, label, total, unit) - type(value_t), intent(in) :: value - type(timer_t), intent(in) :: timer - integer, intent(in) :: indent_level - character(len=name_length), intent(in) :: label - type(value_t), intent(in) :: total - integer, intent(in) :: unit - - character(len=64) :: format_spec - - write(format_spec,'("(",i0,"x,""|_ "",a",i0,",2x,f12.6,1x,f12.3)")') indent_level * 2 + 1, name_length - write(unit,format_spec,advance='no') & - label, & - real(value%micros, kind=rk) * 1e-6_rk, & - real(value%micros, kind=rk) / real(total%micros, kind=rk) - - if (timer%print_allocated_memory) then - write(unit,'(1x,a12)',advance='no') & - nice_format(real(value%rsssize, kind=C_DOUBLE)) - endif - - if (timer%print_virtual_memory) then - write(unit,'(1x,a12)',advance='no') & - nice_format(real(value%virtualmem, kind=C_DOUBLE)) - endif - - if (timer%print_max_allocated_memory) then - write(unit,'(1x,a12)',advance='no') & - nice_format(real(value%maxrsssize, kind=C_DOUBLE)) - endif - - if (timer%print_flop_count) then - write(unit,'(1x,f12.2)',advance='no') real(value%flop_count, 
kind=rk) / 1e6_rk - endif - if (timer%print_flop_rate) then - write(unit,'(1x,f12.2)',advance='no') real(value%flop_count, kind=rk) / value%micros - endif - if (timer%print_ldst) then - write(unit,'(1x,a12)',advance='no') nice_format(real(value%ldst, kind=rk)) - endif - if (timer%print_memory_bandwidth) then - write(unit,'(1x,a12)',advance='no') nice_format(real(value%ldst*timer%bytes_per_ldst, kind=rk) / (value%micros * 1e-6_rk)) - endif - if (timer%print_ai) then - write(unit,'(1x,f12.4)',advance='no') real(value%flop_count, kind=rk) / value%ldst / timer%bytes_per_ldst - endif - - write(unit,'(a)') "" - end subroutine - - pure elemental function nice_format(number) result(string) - real(kind=C_DOUBLE), intent(in) :: number - character(len=12) :: string - real(kind=C_DOUBLE), parameter :: & - kibi = 2.0_C_DOUBLE**10, & - mebi = 2.0_C_DOUBLE**20, & - gibi = 2.0_C_DOUBLE**30, & - tebi = 2.0_C_DOUBLE**40, & - pebi = 2.0_C_DOUBLE**50 - - if (abs(number) >= pebi) then - write(string,'(es12.2)') number - else if (abs(number) >= tebi) then - write(string,'(f9.2,'' Ti'')') number / tebi - else if (abs(number) >= gibi) then - write(string,'(f9.2,'' Gi'')') number / gibi - else if (abs(number) >= mebi) then - write(string,'(f9.2,'' Mi'')') number / mebi - else if (abs(number) >= kibi) then - write(string,'(f9.2,'' ki'')') number / kibi - else - write(string,'(f12.2)') number - endif - end function - - -end module diff --git a/src/ftimings/ftimings_type.F90 b/src/ftimings/ftimings_type.F90 deleted file mode 100644 index 7f933fbbb..000000000 --- a/src/ftimings/ftimings_type.F90 +++ /dev/null @@ -1,23 +0,0 @@ -! Copyright 2014 Lorenz Hüdepohl -! -! This file is part of ftimings. -! -! ftimings is free software: you can redistribute it and/or modify -! it under the terms of the GNU Lesser General Public License as published by -! the Free Software Foundation, either version 3 of the License, or -! (at your option) any later version. -! -! 
ftimings is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ftimings. If not, see <http://www.gnu.org/licenses/>. - -module ftimings_type - use, intrinsic :: iso_c_binding, only : C_INT64_T, C_DOUBLE, C_LONG_LONG, C_LONG, C_INT - implicit none - integer, parameter :: rk = C_DOUBLE - integer, parameter :: name_length = 40 -end module diff --git a/src/ftimings/ftimings_value.F90 b/src/ftimings/ftimings_value.F90 deleted file mode 100644 index e5cc78fee..000000000 --- a/src/ftimings/ftimings_value.F90 +++ /dev/null @@ -1,93 +0,0 @@ -! Copyright 2014 Lorenz Hüdepohl -! -! This file is part of ftimings. -! -! ftimings is free software: you can redistribute it and/or modify -! it under the terms of the GNU Lesser General Public License as published by -! the Free Software Foundation, either version 3 of the License, or -! (at your option) any later version. -! -! ftimings is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ftimings. If not, see <http://www.gnu.org/licenses/>. - -#ifdef HAVE_CONFIG_H -#include "config-f90.h" -#endif - -module ftimings_value - use ftimings_type - implicit none - public - - type value_t - integer(kind=C_INT64_T) :: micros = 0 ! microseconds spent in this node - integer(kind=C_LONG) :: virtualmem = 0 ! newly created virtual memory - integer(kind=C_LONG) :: maxrsssize = 0 ! newly used max. resident mem ("high water mark") - - integer(kind=C_LONG) :: rsssize = 0 ! 
newly used resident memory - - integer(kind=C_LONG_LONG) :: flop_count = 0 ! floating point operations done in this node - integer(kind=C_LONG_LONG) :: ldst = 0 ! number of loads and stores - end type - - interface operator(+) - module procedure value_add - end interface - - interface operator(-) - module procedure value_minus - module procedure value_inverse - end interface - - type(value_t), parameter :: null_value = value_t(micros = 0, & - rsssize = 0, & - virtualmem = 0, & - maxrsssize = 0, & - flop_count = 0) - - contains - - pure elemental function value_add(a,b) result(c) - class(value_t), intent(in) :: a, b - type(value_t) :: c - c%micros = a%micros + b%micros - c%rsssize = a%rsssize + b%rsssize - c%virtualmem = a%virtualmem + b%virtualmem - c%maxrsssize = a%maxrsssize + b%maxrsssize -#ifdef HAVE_LIBPAPI - c%flop_count = a%flop_count + b%flop_count - c%ldst = a%ldst + b%ldst -#endif - end function - - pure elemental function value_minus(a,b) result(c) - class(value_t), intent(in) :: a, b - type(value_t) :: c - c%micros = a%micros - b%micros - c%rsssize = a%rsssize - b%rsssize - c%virtualmem = a%virtualmem - b%virtualmem - c%maxrsssize = a%maxrsssize - b%maxrsssize -#ifdef HAVE_LIBPAPI - c%flop_count = a%flop_count - b%flop_count - c%ldst = a%ldst - b%ldst -#endif - end function - - pure elemental function value_inverse(a) result(neg_a) - class(value_t), intent(in) :: a - type(value_t) :: neg_a - neg_a%micros = - a%micros - neg_a%rsssize = - a%rsssize - neg_a%virtualmem = - a%virtualmem - neg_a%maxrsssize = - a%maxrsssize -#ifdef HAVE_LIBPAPI - neg_a%flop_count = - a%flop_count - neg_a%ldst = - a%ldst -#endif - end function -end module diff --git a/src/ftimings/highwater_mark.c b/src/ftimings/highwater_mark.c deleted file mode 100644 index 680e5c1c0..000000000 --- a/src/ftimings/highwater_mark.c +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright 2014 Andreas Marek, Lorenz Hüdepohl - * - * This file is part of ftimings. 
- * - * ftimings is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ftimings is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with ftimings. If not, see <http://www.gnu.org/licenses/>. - */ -#include <sys/types.h> -#define _GNU_SOURCE -#include <stdio.h> -#include <unistd.h> -#include <stdlib.h> -#include <string.h> - -long ftimings_highwater_mark() { - long hwm = 0L; - char line[1024]; - FILE* fp = NULL; - - if ((fp = fopen( "/proc/self/status", "r" )) == NULL ) { - return 0L; - } - - /* Read memory size data from /proc/pid/status */ - while(fgets(line, sizeof line, fp)) { - if (sscanf(line, "VmHWM: %ld kB", &hwm) == 1) { - break; - } - } - fclose(fp); - - return hwm * 1024L; -} diff --git a/src/ftimings/papi.c b/src/ftimings/papi.c deleted file mode 100644 index fe18cc8cf..000000000 --- a/src/ftimings/papi.c +++ /dev/null @@ -1,164 +0,0 @@ -/* Copyright 2014 Lorenz Hüdepohl - * - * This file is part of ftimings. - * - * ftimings is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ftimings is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with ftimings. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <stdio.h> -#include <stdlib.h> - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -static int event_set; - -static int tried_papi_init = 0; -static int papi_available = 0; -static int flops_available = 0; -static int ldst_available = 0; - -#ifdef HAVE_LIBPAPI -#include <papi.h> - -int ftimings_papi_init(void) { - int ret; - - if (tried_papi_init) { - return papi_available; - } - -#pragma omp critical - { - /* Think about it :) */ - if (tried_papi_init) { - goto end; - } - - tried_papi_init = 1; - - event_set = PAPI_NULL; - - if ((ret = PAPI_library_init(PAPI_VER_CURRENT)) < 0) { - fprintf(stderr, "ftimings: %s:%d: PAPI_library_init(%d): %s\n", - __FILE__, __LINE__, PAPI_VER_CURRENT, PAPI_strerror(ret)); - goto error; - } - - if ((ret = PAPI_create_eventset(&event_set)) < 0) { - fprintf(stderr, "ftimings: %s:%d PAPI_create_eventset(): %s\n", - __FILE__, __LINE__, PAPI_strerror(ret)); - goto error; - } - - /* Check FLOP counter availability */ - if ((ret = PAPI_query_event(PAPI_DP_OPS)) < 0) { - fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_DP_OPS): %s\n", - __FILE__, __LINE__, PAPI_strerror(ret)); - flops_available = 0; - } else if ((ret = PAPI_add_event(event_set, PAPI_DP_OPS)) < 0) { - fprintf(stderr, "ftimings: %s:%d PAPI_add_event(): %s\n", - __FILE__, __LINE__, PAPI_strerror(ret)); - flops_available = 0; - } else { - flops_available = 1; - } - - /* Loads + Stores */ - if ((ret = PAPI_query_event(PAPI_LD_INS)) < 0) { - fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_LD_INS): %s\n", - __FILE__, __LINE__, PAPI_strerror(ret)); - ldst_available = 0; - } else if ((ret = PAPI_query_event(PAPI_SR_INS)) < 0) { - fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_SR_INS): %s\n", - __FILE__, __LINE__, PAPI_strerror(ret)); - ldst_available = 0; - } else if ((ret = 
PAPI_add_event(event_set, PAPI_LD_INS)) < 0) { - fprintf(stderr, "ftimings: %s:%d PAPI_add_event(event_set, PAPI_LD_INS): %s\n", - __FILE__, __LINE__, PAPI_strerror(ret)); - ldst_available = 0; - } else if ((ret = PAPI_add_event(event_set, PAPI_SR_INS)) < 0) { - fprintf(stderr, "ftimings: %s:%d PAPI_add_event(event_set, PAPI_SR_INS): %s\n", - __FILE__, __LINE__, PAPI_strerror(ret)); - ldst_available = 0; - } else { - ldst_available = 1; - } - - /* Start */ - if ((ret = PAPI_start(event_set)) < 0) { - fprintf(stderr, "ftimings: %s:%d PAPI_start(): %s\n", - __FILE__, __LINE__, PAPI_strerror(ret)); - goto error; - } - - goto end; - -error: - /* PAPI works */ - papi_available = 0; - -end: - /* PAPI works */ - papi_available = 1; - - } /* End of critical region */ - - return papi_available; -} - -int ftimings_flop_init(void) { - int ret; - - if (!tried_papi_init) { - ftimings_papi_init(); - } - - return flops_available; -} - -int ftimings_loads_stores_init(void) { - int ret; - - if (!tried_papi_init) { - ftimings_papi_init(); - } - - return ldst_available; -} - -void ftimings_papi_counters(long long *flops, long long *ldst) { - long long res[3]; - int i, ret; - - if ((ret = PAPI_read(event_set, &res[0])) < 0) { - fprintf(stderr, "PAPI_read: %s\n", PAPI_strerror(ret)); - exit(1); - } - - i = 0; - if (flops_available) { - *flops = res[i++]; - } else { - *flops = 0LL; - } - if (ldst_available) { - *ldst = res[i++]; - *ldst += res[i++]; - } else { - *ldst = 0LL; - } -} -#endif diff --git a/src/ftimings/resident_set_size.c b/src/ftimings/resident_set_size.c deleted file mode 100644 index 4148687e0..000000000 --- a/src/ftimings/resident_set_size.c +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright 2014 Lorenz Hüdepohl - * - * This file is part of ftimings. 
- * - * ftimings is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ftimings is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with ftimings. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <stdio.h> -#include <unistd.h> - -long ftimings_resident_set_size() { - long rss = 0L; - FILE* fp = NULL; - if ((fp = fopen( "/proc/self/statm", "r" )) == NULL ) { - return 0L; - } - if (fscanf(fp, "%*s%ld", &rss) != 1) { - fclose(fp); - return (size_t)0L; /* Can't read? */ - } - fclose(fp); - return rss * sysconf( _SC_PAGESIZE); -} diff --git a/src/ftimings/time.c b/src/ftimings/time.c deleted file mode 100644 index 638366fa3..000000000 --- a/src/ftimings/time.c +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright 2014 Lorenz Hüdepohl - * - * This file is part of ftimings. - * - * ftimings is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ftimings is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with ftimings. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include <sys/time.h> -#include <stdio.h> -#include <unistd.h> -#include <stdint.h> -#include <stddef.h> -#include <stdlib.h> - -#ifdef HAVE_CONFIG_H -#include "config-f90.h" -#endif - -/* Return number of microseconds since 1.1.1970, in a 64 bit integer. - * (with 2^64 us ~ 6 * 10^5 years, this should be sufficiently overflow safe) - */ -int64_t ftimings_microseconds_since_epoch(void) { - struct timeval tv; - if (gettimeofday(&tv, NULL) != 0) { - perror("gettimeofday"); - exit(1); - } - return (int64_t) (tv.tv_sec) * ((int64_t) 1000000) + (int64_t)(tv.tv_usec); -} diff --git a/src/ftimings/virtual_memory.c b/src/ftimings/virtual_memory.c deleted file mode 100644 index 90b09dbc1..000000000 --- a/src/ftimings/virtual_memory.c +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright 2014 Andreas Marek, Lorenz Hüdepohl - * - * This file is part of ftimings. - * - * ftimings is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ftimings is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with ftimings. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <stdio.h> -#include <unistd.h> - -long ftimings_virtual_memory() { - - long rss = 0L; - FILE* fp = NULL; - if ((fp = fopen( "/proc/self/statm", "r" )) == NULL ) { - return 0L; - } - if (fscanf(fp, "%ld", &rss) != 1) { - fclose(fp); - return (size_t)0L; /* Can't read? 
*/ - } - fclose(fp); - return rss * sysconf( _SC_PAGESIZE); -} diff --git a/src/mod_compute_hh_trafo_complex.F90 b/src/mod_compute_hh_trafo_complex.F90 deleted file mode 100644 index fe9b4f00e..000000000 --- a/src/mod_compute_hh_trafo_complex.F90 +++ /dev/null @@ -1,260 +0,0 @@ -module compute_hh_trafo_complex -#include "config-f90.h" - implicit none - -#ifdef WITH_OPENMP - public compute_hh_trafo_complex_cpu_openmp -#else - public compute_hh_trafo_complex_cpu -#endif - - include 'mpif.h' - - contains - -#ifdef WITH_OPENMP - subroutine compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - off, ncols, istripe, & - my_thread, THIS_COMPLEX_ELPA_KERNEL) -#else - subroutine compute_hh_trafo_complex_cpu (a, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - off, ncols, istripe, last_stripe_width, & - THIS_COMPLEX_ELPA_KERNEL) -#endif - use precision - use elpa2_utilities -#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) - use complex_generic_simple_kernel, only : single_hh_trafo_complex_generic_simple -#endif -#if defined(WITH_COMPLEX_GENERIC_KERNEL) - use complex_generic_kernel, only : single_hh_trafo_complex_generic -#endif -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - real(kind=rk), intent(inout) :: kernel_time - integer(kind=lik) :: kernel_flops - integer(kind=ik), intent(in) :: nbw, max_blk_size - complex(kind=ck) :: bcast_buffer(nbw,max_blk_size) - integer(kind=ik), intent(in) :: a_off - - integer(kind=ik), intent(in) :: stripe_width, a_dim2, stripe_count -#ifndef WITH_OPENMP - integer(kind=ik), intent(in) :: last_stripe_width - complex(kind=ck) :: a(stripe_width,a_dim2,stripe_count) -#else - integer(kind=ik), intent(in) :: max_threads - complex(kind=ck) :: a(stripe_width,a_dim2,stripe_count,max_threads) -#endif - integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL - - ! 
Private variables in OMP regions (my_thread) should better be in the argument list! - - integer(kind=ik) :: off, ncols, istripe, j, nl, jj -#ifdef WITH_OPENMP - integer(kind=ik) :: my_thread, noff -#endif - real(kind=rk) :: ttt - - !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - ! Currently (on Sandy Bridge), single is faster than double - !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - complex(kind=ck) :: w(nbw,2) - -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%stop("compute_hh_trafo_complex_cpu_openmp") -#else - call timer%stop("compute_hh_trafo_complex_cpu") -#endif -#endif - -#ifdef WITH_OPENMP - if (istripe<stripe_count) then - nl = stripe_width - else - noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width - nl = min(my_thread*thread_width-noff, l_nev-noff) - if(nl<=0) then -#ifdef WITH_OPENMP - call timer%stop("compute_hh_trafo_complex_cpu_openmp") -#else - call timer%stop("compute_hh_trafo_complex_cpu") -#endif - return - endif - endif -#else - nl = merge(stripe_width, last_stripe_width, istripe<stripe_count) -#endif - -#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) -#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) - if (THIS_COMPLEX_ELPA_KERNEL .eq. 
COMPLEX_ELPA_KERNEL_AVX_BLOCK2) then -#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ - ttt = mpi_wtime() - do j = ncols, 2, -2 - w(:,1) = bcast_buffer(1:nbw,j+off) - w(:,2) = bcast_buffer(1:nbw,j+off-1) -#ifdef WITH_OPENMP - call double_hh_trafo_complex_sse_avx_2hv(a(1,j+off+a_off-1,istripe,my_thread), & - w, nbw, nl, stripe_width, nbw) -#else - call double_hh_trafo_complex_sse_avx_2hv(a(1,j+off+a_off-1,istripe), & - w, nbw, nl, stripe_width, nbw) -#endif - enddo -#ifdef WITH_OPENMP - if (j==1) call single_hh_trafo_complex_sse_avx_1hv(a(1,1+off+a_off,istripe,my_thread), & - bcast_buffer(1,off+1), nbw, nl, stripe_width) -#else - if (j==1) call single_hh_trafo_complex_sse_avx_1hv(a(1,1+off+a_off,istripe), & - bcast_buffer(1,off+1), nbw, nl, stripe_width) -#endif -#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) - endif -#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ -#endif /* WITH_COMPLEX_AVX_BLOCK2_KERNEL */ - - -#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) -#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) - if (THIS_COMPLEX_ELPA_KERNEL .eq. 
COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) then -#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ - ttt = mpi_wtime() - do j = ncols, 1, -1 -#ifdef WITH_OPENMP -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call single_hh_trafo_complex_generic_simple(a(1,j+off+a_off,istripe,my_thread), & - bcast_buffer(1,j+off),nbw,nl,stripe_width) -#else - call single_hh_trafo_complex_generic_simple(a(1:stripe_width,j+off+a_off:j+off_a_off+nbw-1,istripe,my_thread), & - bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width) -#endif - -#else /* WITH_OPENMP */ -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call single_hh_trafo_complex_generic_simple(a(1,j+off+a_off,istripe), & - bcast_buffer(1,j+off),nbw,nl,stripe_width) -#else - call single_hh_trafo_complex_generic_simple(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), & - bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width) -#endif - -#endif /* WITH_OPENMP */ - enddo -#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) - endif -#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ -#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */ - - -#if defined(WITH_COMPLEX_GENERIC_KERNEL) -#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) - if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC .or. & - THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGP .or. & - THIS_COMPLEX_ELPA_KERNEL .eq. 
COMPLEX_ELPA_KERNEL_BGQ ) then -#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ - ttt = mpi_wtime() - do j = ncols, 1, -1 -#ifdef WITH_OPENMP -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - - call single_hh_trafo_complex_generic(a(1,j+off+a_off,istripe,my_thread), & - bcast_buffer(1,j+off),nbw,nl,stripe_width) -#else - call single_hh_trafo_complex_generic(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), & - bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width) -#endif - -#else /* WITH_OPENMP */ -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call single_hh_trafo_complex_generic(a(1,j+off+a_off,istripe), & - bcast_buffer(1,j+off),nbw,nl,stripe_width) -#else - call single_hh_trafo_complex_generic(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), & - bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width) -#endif -#endif /* WITH_OPENMP */ - - enddo -#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) - endif -#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ -#endif /* WITH_COMPLEX_GENERIC_KERNEL */ - -#if defined(WITH_COMPLEX_SSE_KERNEL) -#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) - if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) then -#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ - ttt = mpi_wtime() - do j = ncols, 1, -1 -#ifdef WITH_OPENMP - call single_hh_trafo_complex(a(1,j+off+a_off,istripe,my_thread), & - bcast_buffer(1,j+off),nbw,nl,stripe_width) -#else - call single_hh_trafo_complex(a(1,j+off+a_off,istripe), & - bcast_buffer(1,j+off),nbw,nl,stripe_width) -#endif - enddo -#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) - endif -#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ -#endif /* WITH_COMPLEX_SSE_KERNEL */ - - -!#if defined(WITH_AVX_SANDYBRIDGE) -! call single_hh_trafo_complex_sse_avx_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width) -!#endif - -!#if defined(WITH_AMD_BULLDOZER) -! 
call single_hh_trafo_complex_sse_avx_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width) -!#endif - -#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) -#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) - if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) then -#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ - ttt = mpi_wtime() - do j = ncols, 1, -1 -#ifdef WITH_OPENMP - call single_hh_trafo_complex_sse_avx_1hv(a(1,j+off+a_off,istripe,my_thread), & - bcast_buffer(1,j+off),nbw,nl,stripe_width) -#else - call single_hh_trafo_complex_sse_avx_1hv(a(1,j+off+a_off,istripe), & - bcast_buffer(1,j+off),nbw,nl,stripe_width) -#endif - enddo -#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) - endif -#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ -#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNE */ - -#ifdef WITH_OPENMP - if (my_thread==1) then -#endif - kernel_flops = kernel_flops + 4*4*int(nl,8)*int(ncols,8)*int(nbw,8) - kernel_time = kernel_time + mpi_wtime()-ttt -#ifdef WITH_OPENMP - endif -#endif -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%stop("compute_hh_trafo_complex_cpu_openmp") -#else - call timer%stop("compute_hh_trafo_complex_cpu") -#endif -#endif - -#ifdef WITH_OPENM - end subroutine compute_hh_trafo_complex_cpu_openmp -#else - end subroutine compute_hh_trafo_complex_cpu - -#endif - -end module diff --git a/src/mod_compute_hh_trafo_real.F90 b/src/mod_compute_hh_trafo_real.F90 deleted file mode 100644 index 9bf517c0a..000000000 --- a/src/mod_compute_hh_trafo_real.F90 +++ /dev/null @@ -1,417 +0,0 @@ -module compute_hh_trafo_real -#include "config-f90.h" - implicit none - -#ifdef WITH_OPENMP - public compute_hh_trafo_real_cpu_openmp -#else - public compute_hh_trafo_real_cpu -#endif - - include 'mpif.h' - - contains - -#ifdef WITH_OPENMP - subroutine compute_hh_trafo_real_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - off, ncols, istripe, & - my_thread, 
THIS_REAL_ELPA_KERNEL) -#else - subroutine compute_hh_trafo_real_cpu (a, stripe_width,a_dim2,stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - off, ncols, istripe, last_stripe_width, & - THIS_REAL_ELPA_KERNEL) -#endif - - - use precision - use elpa2_utilities - use single_hh_trafo_real -#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL) - use real_generic_simple_kernel, only : double_hh_trafo_generic_simple -#endif - -#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(DESPERATELY_WANT_ASSUMED_SIZE)) - use real_generic_kernel, only : double_hh_trafo_generic -#endif - -#if defined(WITH_REAL_BGP_KERNEL) - use real_bgp_kernel, only : double_hh_trafo_bgp -#endif - -#if defined(WITH_REAL_BGQ_KERNEL) - use real_bgq_kernel, only : double_hh_trafo_bgq -#endif -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - include "mpif.h" - real(kind=rk), intent(inout) :: kernel_time - integer(kind=lik) :: kernel_flops - integer(kind=ik), intent(in) :: nbw, max_blk_size - real(kind=rk) :: bcast_buffer(nbw,max_blk_size) - integer(kind=ik), intent(in) :: a_off - - integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count - -#ifndef WITH_OPENMP - integer(kind=ik), intent(in) :: last_stripe_width - real(kind=rk) :: a(stripe_width,a_dim2,stripe_count) -#else - integer(kind=ik), intent(in) :: max_threads - real(kind=rk) :: a(stripe_width,a_dim2,stripe_count,max_threads) -#endif - integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL - - ! Private variables in OMP regions (my_thread) should better be in the argument list! 
- integer(kind=ik) :: off, ncols, istripe -#ifdef WITH_OPENMP - integer(kind=ik) :: my_thread, noff -#endif - integer(kind=ik) :: j, nl, jj, jjj - real(kind=rk) :: w(nbw,6), ttt - -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%start("compute_hh_trafo_real_cpu_openmp") -#else - call timer%start("compute_hh_trafo_real_cpu") -#endif -#endif - ttt = mpi_wtime() - -#ifndef WITH_OPENMP - nl = merge(stripe_width, last_stripe_width, istripe<stripe_count) -#else - - if (istripe<stripe_count) then - nl = stripe_width - else - noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width - nl = min(my_thread*thread_width-noff, l_nev-noff) - if (nl<=0) then -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%stop("compute_hh_trafo_real_cpu_openmp") -#else - call timer%stop("compute_hh_trafo_real_cpu") -#endif -#endif - return - endif - endif -#endif - -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2 .or. & - THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC .or. & - THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE .or. & - THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE .or. & - THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGP .or. & - THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGQ) then -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ - - !FORTRAN CODE / X86 INRINISIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS -#if defined(WITH_REAL_GENERIC_KERNEL) -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - if (THIS_REAL_ELPA_KERNEL .eq. 
REAL_ELPA_KERNEL_GENERIC) then -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ - - do j = ncols, 2, -2 - w(:,1) = bcast_buffer(1:nbw,j+off) - w(:,2) = bcast_buffer(1:nbw,j+off-1) - -#ifdef WITH_OPENMP -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call double_hh_trafo_generic(a(1,j+off+a_off-1,istripe,my_thread), w, & - nbw, nl, stripe_width, nbw) - -#else - call double_hh_trafo_generic(a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, & - istripe,my_thread), w(1:nbw,1:6), & - nbw, nl, stripe_width, nbw) -#endif - -#else /* WITH_OPENMP */ - -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call double_hh_trafo_generic(a(1,j+off+a_off-1,istripe),w, & - nbw, nl, stripe_width, nbw) - -#else - call double_hh_trafo_generic(a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), & - nbw, nl, stripe_width, nbw) -#endif -#endif /* WITH_OPENMP */ - - enddo - -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - endif -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ -#endif /* WITH_REAL_GENERIC_KERNEL */ - - -#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL) -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - if (THIS_REAL_ELPA_KERNEL .eq. 
REAL_ELPA_KERNEL_GENERIC_SIMPLE) then -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ - do j = ncols, 2, -2 - w(:,1) = bcast_buffer(1:nbw,j+off) - w(:,2) = bcast_buffer(1:nbw,j+off-1) -#ifdef WITH_OPENMP -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call double_hh_trafo_generic_simple(a(1,j+off+a_off-1,istripe,my_thread), & - w, nbw, nl, stripe_width, nbw) -#else - call double_hh_trafo_generic_simple(a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), & - w, nbw, nl, stripe_width, nbw) - -#endif - -#else /* WITH_OPENMP */ -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call double_hh_trafo_generic_simple(a(1,j+off+a_off-1,istripe), & - w, nbw, nl, stripe_width, nbw) -#else - call double_hh_trafo_generic_simple(a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), & - w, nbw, nl, stripe_width, nbw) - -#endif - -#endif /* WITH_OPENMP */ - - enddo -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - endif -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ -#endif /* WITH_REAL_GENERIC_SIMPLE_KERNEL */ - - -#if defined(WITH_REAL_SSE_KERNEL) -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) then -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ - do j = ncols, 2, -2 - w(:,1) = bcast_buffer(1:nbw,j+off) - w(:,2) = bcast_buffer(1:nbw,j+off-1) -#ifdef WITH_OPENMP - call double_hh_trafo(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, & - stripe_width, nbw) -#else - call double_hh_trafo(a(1,j+off+a_off-1,istripe), w, nbw, nl, & - stripe_width, nbw) -#endif - enddo -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - endif -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ -#endif /* WITH_REAL_SSE_KERNEL */ - - -#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - if (THIS_REAL_ELPA_KERNEL .eq. 
REAL_ELPA_KERNEL_AVX_BLOCK2) then -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ - do j = ncols, 2, -2 - w(:,1) = bcast_buffer(1:nbw,j+off) - w(:,2) = bcast_buffer(1:nbw,j+off-1) -#ifdef WITH_OPENMP - call double_hh_trafo_real_sse_avx_2hv(a(1,j+off+a_off-1,istripe,my_thread), & - w, nbw, nl, stripe_width, nbw) -#else - call double_hh_trafo_real_sse_avx_2hv(a(1,j+off+a_off-1,istripe), & - w, nbw, nl, stripe_width, nbw) -#endif - enddo -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - endif -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ -#endif /* WITH_REAL_AVX_BLOCK2_KERNEL */ - -#if defined(WITH_REAL_BGP_KERNEL) -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGP) then -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ - do j = ncols, 2, -2 - w(:,1) = bcast_buffer(1:nbw,j+off) - w(:,2) = bcast_buffer(1:nbw,j+off-1) -#ifdef WITH_OPENMP - call double_hh_trafo_bgp(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, & - stripe_width, nbw) -#else - call double_hh_trafo_bgp(a(1,j+off+a_off-1,istripe), w, nbw, nl, & - stripe_width, nbw) -#endif - enddo -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - endif -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ -#endif /* WITH_REAL_BGP_KERNEL */ - - -#if defined(WITH_REAL_BGQ_KERNEL) -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGQ) then -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ - do j = ncols, 2, -2 - w(:,1) = bcast_buffer(1:nbw,j+off) - w(:,2) = bcast_buffer(1:nbw,j+off-1) -#ifdef WITH_OPENMP - call double_hh_trafo_bgq(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, & - stripe_width, nbw) -#else - call double_hh_trafo_bgq(a(1,j+off+a_off-1,istripe), w, nbw, nl, & - stripe_width, nbw) -#endif - enddo -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - endif -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ -#endif /* WITH_REAL_BGQ_KERNEL */ - - -!#if defined(WITH_AVX_SANDYBRIDGE) -! 
call double_hh_trafo_real_sse_avx_2hv(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw) -!#endif - -#ifdef WITH_OPENMP - if (j==1) call single_hh_trafo_real_cpu_openmp(a(1:stripe_width,1+off+a_off:1+off_a_off+nbw-1,istripe,my_thread), & - bcast_buffer(1:nbw,off+1), nbw, nl, & - stripe_width) -#else - if (j==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & - bcast_buffer(1:nbw,off+1), nbw, nl, & - stripe_width) -#endif - - -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - endif ! -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ - - - -#if defined(WITH_REAL_AVX_BLOCK4_KERNEL) -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) then -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ - ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS - do j = ncols, 4, -4 - w(:,1) = bcast_buffer(1:nbw,j+off) - w(:,2) = bcast_buffer(1:nbw,j+off-1) - w(:,3) = bcast_buffer(1:nbw,j+off-2) - w(:,4) = bcast_buffer(1:nbw,j+off-3) -#ifdef WITH_OPENMP - call quad_hh_trafo_real_sse_avx_4hv(a(1,j+off+a_off-3,istripe,my_thread), w, & - nbw, nl, stripe_width, nbw) -#else - call quad_hh_trafo_real_sse_avx_4hv(a(1,j+off+a_off-3,istripe), w, & - nbw, nl, stripe_width, nbw) -#endif - enddo - do jj = j, 2, -2 - w(:,1) = bcast_buffer(1:nbw,jj+off) - w(:,2) = bcast_buffer(1:nbw,jj+off-1) -#ifdef WITH_OPENMP - call double_hh_trafo_real_sse_avx_2hv(a(1,jj+off+a_off-1,istripe,my_thread), & - w, nbw, nl, stripe_width, nbw) -#else - call double_hh_trafo_real_sse_avx_2hv(a(1,jj+off+a_off-1,istripe), & - w, nbw, nl, stripe_width, nbw) -#endif - enddo -#ifdef WITH_OPENMP - if (jj==1) call single_hh_trafo_real_cpu_openmp(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), & - bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) -#else - if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & - bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) -#endif -#if 
defined(WITH_NO_SPECIFIC_REAL_KERNEL) - endif -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ -#endif /* WITH_REAL_AVX_BLOCK4_KERNEL */ - - -#if defined(WITH_REAL_AVX_BLOCK6_KERNEL) -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK6) then -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ - ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS - do j = ncols, 6, -6 - w(:,1) = bcast_buffer(1:nbw,j+off) - w(:,2) = bcast_buffer(1:nbw,j+off-1) - w(:,3) = bcast_buffer(1:nbw,j+off-2) - w(:,4) = bcast_buffer(1:nbw,j+off-3) - w(:,5) = bcast_buffer(1:nbw,j+off-4) - w(:,6) = bcast_buffer(1:nbw,j+off-5) -#ifdef WITH_OPENMP - call hexa_hh_trafo_real_sse_avx_6hv(a(1,j+off+a_off-5,istripe,my_thread), w, & - nbw, nl, stripe_width, nbw) -#else - call hexa_hh_trafo_real_sse_avx_6hv(a(1,j+off+a_off-5,istripe), w, & - nbw, nl, stripe_width, nbw) -#endif - enddo - do jj = j, 4, -4 - w(:,1) = bcast_buffer(1:nbw,jj+off) - w(:,2) = bcast_buffer(1:nbw,jj+off-1) - w(:,3) = bcast_buffer(1:nbw,jj+off-2) - w(:,4) = bcast_buffer(1:nbw,jj+off-3) -#ifdef WITH_OPENMP - call quad_hh_trafo_real_sse_avx_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, & - nbw, nl, stripe_width, nbw) -#else - call quad_hh_trafo_real_sse_avx_4hv(a(1,jj+off+a_off-3,istripe), w, & - nbw, nl, stripe_width, nbw) -#endif - enddo - do jjj = jj, 2, -2 - w(:,1) = bcast_buffer(1:nbw,jjj+off) - w(:,2) = bcast_buffer(1:nbw,jjj+off-1) -#ifdef WITH_OPENMP - call double_hh_trafo_real_sse_avx_2hv(a(1,jjj+off+a_off-1,istripe,my_thread), & - w, nbw, nl, stripe_width, nbw) -#else - call double_hh_trafo_real_sse_avx_2hv(a(1,jjj+off+a_off-1,istripe), & - w, nbw, nl, stripe_width, nbw) -#endif - enddo -#ifdef WITH_OPENMP - if (jjj==1) call single_hh_trafo_real_cpu_openmp(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), & - bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) -#else - if (jjj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & - 
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) -#endif -#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) - endif -#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ -#endif /* WITH_REAL_AVX_BLOCK4_KERNEL */ - -#ifdef WITH_OPENMP - if (my_thread==1) then -#endif - kernel_flops = kernel_flops + 4*int(nl,8)*int(ncols,8)*int(nbw,8) - kernel_time = kernel_time + mpi_wtime()-ttt -#ifdef WITH_OPENMP - endif -#endif -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%stop("compute_hh_trafo_real_cpu_openmp") -#else - call timer%stop("compute_hh_trafo_real_cpu") -#endif -#endif - -#ifdef WITH_OPENMP - end subroutine compute_hh_trafo_real_cpu_openmp -#else - end subroutine compute_hh_trafo_real_cpu -#endif - -end module diff --git a/src/mod_pack_unpack_complex.F90 b/src/mod_pack_unpack_complex.F90 deleted file mode 100644 index b51a9c38d..000000000 --- a/src/mod_pack_unpack_complex.F90 +++ /dev/null @@ -1,130 +0,0 @@ -module pack_unpack_complex -#include "config-f90.h" - implicit none - -#ifdef WITH_OPENMP - public pack_row_complex_cpu_openmp -#else - public pack_row_complex_cpu -#endif - contains -#ifdef WITH_OPENMP - subroutine pack_row_complex_cpu_openmp(a, row, n, stripe_width, stripe_count, max_threads, thread_width, l_nev) -#else - subroutine pack_row_complex_cpu(a, row, n, stripe_width, last_stripe_width, stripe_count) -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none -#ifdef WITH_OPENMP - integer(kind=ik), intent(in) :: stripe_width, stripe_count, max_threads, thread_width, l_nev - complex(kind=ck), intent(in) :: a(:,:,:,:) -#else - integer(kind=ik), intent(in) :: stripe_width, last_stripe_width, stripe_count - complex(kind=ck), intent(in) :: a(:,:,:) -#endif - complex(kind=ck) :: row(:) - integer(kind=ik) :: n, i, noff, nl, nt - -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%start("pack_row_complex_cpu_openmp") -#else - call timer%start("pack_row_complex_cpu") -#endif -#endif - -#ifdef WITH_OPENMP - do nt = 1, 
max_threads - do i = 1, stripe_count - noff = (nt-1)*thread_width + (i-1)*stripe_width - nl = min(stripe_width, nt*thread_width-noff, l_nev-noff) - if (nl<=0) exit - row(noff+1:noff+nl) = a(1:nl,n,i,nt) - enddo - enddo -#else - do i=1,stripe_count - nl = merge(stripe_width, last_stripe_width, i<stripe_count) - noff = (i-1)*stripe_width - row(noff+1:noff+nl) = a(1:nl,n,i) - enddo -#endif - -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%stop("pack_row_complex_cpu_openmp") -#else - call timer%stop("pack_row_complex_cpu") -#endif -#endif - -#ifdef WITH_OPENMP - end subroutine pack_row_complex_cpu_openmp -#else - end subroutine pack_row_complex_cpu -#endif - -#ifdef WITH_OPENMP - subroutine unpack_row_complex_cpu_openmp(a, row, n, my_thread, stripe_count, thread_width, stripe_width, l_nev) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - ! Private variables in OMP regions (my_thread) should better be in the argument list! - integer(kind=ik), intent(in) :: n, my_thread - integer(kind=ik), intent(in) :: stripe_count, thread_width, stripe_width, l_nev - complex(kind=ck), intent(in) :: row(:) - complex(kind=ck) :: a(:,:,:,:) - integer(kind=ik) :: i, noff, nl - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("unpack_row_complex_cpu_openmp") -#endif - - do i=1,stripe_count - noff = (my_thread-1)*thread_width + (i-1)*stripe_width - nl = min(stripe_width, my_thread*thread_width-noff, l_nev-noff) - if (nl<=0) exit - a(1:nl,n,i,my_thread) = row(noff+1:noff+nl) - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("unpack_row_complex_cpu_openmp") -#endif - end subroutine unpack_row_complex_cpu_openmp -#else /* WITH_OPENMP */ - - subroutine unpack_row_complex_cpu(a, row, n, stripe_count, stripe_width, last_stripe_width) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, n - complex(kind=ck), intent(in) :: 
row(:) - complex(kind=ck) :: a(:,:,:) - integer(kind=ik) :: i, noff, nl - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("unpack_row_complex_cpu") -#endif - do i=1,stripe_count - nl = merge(stripe_width, last_stripe_width, i<stripe_count) - noff = (i-1)*stripe_width - a(1:nl,n,i) = row(noff+1:noff+nl) - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("unpack_row_complex_cpu") -#endif - - end subroutine unpack_row_complex_cpu -#endif /* WITH_OPENMP */ - -end module diff --git a/src/mod_pack_unpack_real.F90 b/src/mod_pack_unpack_real.F90 deleted file mode 100644 index 7bb58e5f4..000000000 --- a/src/mod_pack_unpack_real.F90 +++ /dev/null @@ -1,139 +0,0 @@ -module pack_unpack_real -#include "config-f90.h" - implicit none - -#ifdef WITH_OPENMP - public pack_row_real_cpu_openmp, unpack_row_real_cpu_openmp -#else - public pack_row_real_cpu, unpack_row_real_cpu -#endif - contains - -#ifdef WITH_OPENMP - subroutine pack_row_real_cpu_openmp(a, row, n, stripe_width, stripe_count, max_threads, thread_width, l_nev) -#else - subroutine pack_row_real_cpu(a, row, n, stripe_width, last_stripe_width, stripe_count) -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in) :: n, stripe_count, stripe_width -#ifdef WITH_OPENMP - integer(kind=ik), intent(in) :: max_threads, thread_width, l_nev - real(kind=rk), intent(in) :: a(:,:,:,:) -#else - integer(kind=ik), intent(in) :: last_stripe_width - real(kind=rk), intent(in) :: a(:,:,:) -#endif - real(kind=rk) :: row(:) - - integer(kind=ik) :: i, noff, nl -#ifdef WITH_OPENMP - integer(kind=ik) :: nt -#endif - -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%start("pack_row_real_cpu_openmp") - -#else - call timer%start("pack_row_real_cpu") -#endif -#endif - -#ifdef WITH_OPENMP - do nt = 1, max_threads - do i = 1, stripe_count - noff = (nt-1)*thread_width + (i-1)*stripe_width - nl = min(stripe_width, nt*thread_width-noff, l_nev-noff) - if (nl<=0) exit - 
row(noff+1:noff+nl) = a(1:nl,n,i,nt) - enddo - enddo -#else - do i=1,stripe_count - nl = merge(stripe_width, last_stripe_width, i<stripe_count) - noff = (i-1)*stripe_width - row(noff+1:noff+nl) = a(1:nl,n,i) - enddo -#endif - -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%stop("pack_row_real_cpu_openmp") - -#else - call timer%stop("pack_row_real_cpu") -#endif -#endif - -#ifdef WITH_OPENMP - end subroutine pack_row_real_cpu_openmp -#else - end subroutine pack_row_real_cpu -#endif - -#ifdef WITH_OPENMP - subroutine unpack_row_real_cpu_openmp(a, row, n, my_thread, stripe_count, thread_width, stripe_width, l_nev) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - ! Private variables in OMP regions (my_thread) should better be in the argument list! - integer(kind=ik), intent(in) :: stripe_count, thread_width, stripe_width, l_nev - real(kind=rk) :: a(:,:,:,:) - integer(kind=ik), intent(in) :: n, my_thread - real(kind=rk), intent(in) :: row(:) - integer(kind=ik) :: i, noff, nl - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("unpack_row_real_cpu_openmp") -#endif - do i=1,stripe_count - noff = (my_thread-1)*thread_width + (i-1)*stripe_width - nl = min(stripe_width, my_thread*thread_width-noff, l_nev-noff) - if(nl<=0) exit - a(1:nl,n,i,my_thread) = row(noff+1:noff+nl) - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("unpack_row_real_cpu_openmp") -#endif - - end subroutine unpack_row_real_cpu_openmp - -#else /* WITH_OPENMP */ - subroutine unpack_row_real_cpu(a, row, n, stripe_count, stripe_width, last_stripe_width) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik), intent(in) :: n, stripe_count, stripe_width, last_stripe_width - real(kind=rk) :: row(:) - real(kind=rk) :: a(:,:,:) - integer(kind=ik) :: i, noff, nl - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("unpack_row_real_cpu") -#endif - - do i=1,stripe_count - nl = merge(stripe_width, 
last_stripe_width, i<stripe_count) - noff = (i-1)*stripe_width - a(1:nl,n,i) = row(noff+1:noff+nl) - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("unpack_row_real_cpu") -#endif - end subroutine unpack_row_real_cpu -#endif /* WITH_OPENMP */ - -end module diff --git a/src/mod_precision.f90 b/src/mod_precision.f90 deleted file mode 100644 index 5b10e4eb1..000000000 --- a/src/mod_precision.f90 +++ /dev/null @@ -1,9 +0,0 @@ -module precision - use iso_c_binding, only : C_FLOAT, C_DOUBLE, C_INT32_T, C_INT64_T - - implicit none - integer, parameter :: rk = C_DOUBLE - integer, parameter :: ck = C_DOUBLE - integer, parameter :: ik = C_INT32_T - integer, parameter :: lik = C_INT64_T -end module precision diff --git a/src/print_available_elpa2_kernels.F90 b/src/print_available_elpa2_kernels.F90 deleted file mode 100644 index 96ba7d1f6..000000000 --- a/src/print_available_elpa2_kernels.F90 +++ /dev/null @@ -1,126 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! 
but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - - - -! ELPA2 -- 2-stage solver for ELPA -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - -#include "config-f90.h" - -!> \file print_available_elpa2_kernels.F90 -!> \par -!> \brief Provide information which ELPA2 kernels are available on this system -!> -!> \details -!> It is possible to configure ELPA2 such, that different compute intensive -!> "ELPA2 kernels" can be choosen at runtime. -!> The service binary print_available_elpa2_kernels will query the library and tell -!> whether ELPA2 has been configured in this way, and if this is the case which kernels can be -!> choosen at runtime. 
-!> It will furthermore detail whether ELPA has been configured with OpenMP support -!> -!> Synopsis: print_available_elpa2_kernels -!> -!> \author A. Marek (MPCDF) -program print_available_elpa2_kernels - - use precision - use ELPA1 - use ELPA2 - - use elpa2_utilities - - implicit none - - integer(kind=ik) :: i - - print *, "This program will give information on the ELPA2 kernels, " - print *, "which are available with this library and it will give " - print *, "information if (and how) the kernels can be choosen at " - print *, "runtime" - print * - print * -#ifdef WITH_OPENMP - print *, " ELPA supports threads: yes" -#else - print *, " ELPA supports threads: no" -#endif - - print *, "Information on ELPA2 real case: " - print *, "=============================== " -#ifdef HAVE_ENVIRONMENT_CHECKING - print *, " choice via environment variable: yes" - print *, " environment variable name : REAL_ELPA_KERNEL" -#else - print * " choice via environment variable: no" -#endif - print * - print *, " Available real kernels are: " - call print_available_real_kernels() - - print * - print * - print *, "Information on ELPA2 complex case: " - print *, "=============================== " -#ifdef HAVE_ENVIRONMENT_CHECKING - print *, " choice via environment variable: yes" - print *, " environment variable name : COMPLEX_ELPA_KERNEL" -#else - print * " choice via environment variable: no" -#endif - print * - print *, " Available complex kernels are: " - call print_available_complex_kernels() - -end program print_available_elpa2_kernels diff --git a/src/redist_band.X90 b/src/redist_band.X90 deleted file mode 100644 index 239feb213..000000000 --- a/src/redist_band.X90 +++ /dev/null @@ -1,323 +0,0 @@ -#if 0 -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! 
Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -#endif -! 
-------------------------------------------------------------------------------------------------- -! redist_band: redistributes band from 2D block cyclic form to 1D band -#if REALCASE==1 -subroutine redist_band_real(r_a, lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, r_ab) -#endif - -#if COMPLEXCASE==1 -subroutine redist_band_complex(c_a, lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, c_ab) -#endif - - - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik), intent(in) :: lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm -#if REALCASE==1 - real(kind=rk), intent(in) :: r_a(lda, matrixCols) -#endif -#if COMPLEXCASE==1 - complex(kind=ck), intent(in) :: c_a(lda, matrixCols) -#endif - - -#if REALCASE==1 - real(kind=rk), intent(out) :: r_ab(:,:) -#endif - -#if COMPLEXCASE==1 - complex(kind=ck), intent(out) :: c_ab(:,:) -#endif - - integer(kind=ik), allocatable :: ncnt_s(:), nstart_s(:), ncnt_r(:), nstart_r(:), & - global_id(:,:), global_id_tmp(:,:), block_limits(:) -#if REALCASE==1 - real(kind=rk), allocatable :: r_sbuf(:,:,:), r_rbuf(:,:,:), r_buf(:,:) -#endif - -#if COMPLEXCASE==1 - complex(kind=ck), allocatable :: c_sbuf(:,:,:), c_rbuf(:,:,:), c_buf(:,:) -#endif - integer(kind=ik) :: i, j, my_pe, n_pes, my_prow, np_rows, my_pcol, np_cols, & - nfact, np, npr, npc, mpierr, is, js - integer(kind=ik) :: nblocks_total, il, jl, l_rows, l_cols, n_off - -#ifdef HAVE_DETAILED_TIMINGS -#if REALCASE==1 - call timer%start("redist_band_real") -#endif -#if COMPLEXCASE==1 - call timer%start("redist_band_complex") -#endif - -#endif - - call mpi_comm_rank(mpi_comm,my_pe,mpierr) - call mpi_comm_size(mpi_comm,n_pes,mpierr) - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - ! 
Get global_id mapping 2D procssor coordinates to global id - - allocate(global_id(0:np_rows-1,0:np_cols-1)) -#ifdef WITH_OPENMP - allocate(global_id_tmp(0:np_rows-1,0:np_cols-1)) -#endif - global_id(:,:) = 0 - global_id(my_prow, my_pcol) = my_pe -#ifdef WITH_OPENMP - global_id_tmp(:,:) = global_id(:,:) - call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) - deallocate(global_id_tmp) -#else - call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) -#endif - - ! Set work distribution - - nblocks_total = (na-1)/nbw + 1 - - allocate(block_limits(0:n_pes)) - call divide_band(nblocks_total, n_pes, block_limits) - - - allocate(ncnt_s(0:n_pes-1)) - allocate(nstart_s(0:n_pes-1)) - allocate(ncnt_r(0:n_pes-1)) - allocate(nstart_r(0:n_pes-1)) - - - nfact = nbw/nblk - - ! Count how many blocks go to which PE - - ncnt_s(:) = 0 - np = 0 ! receiver PE number - do j=0,(na-1)/nblk ! loop over rows of blocks - if (j/nfact==block_limits(np+1)) np = np+1 - if (mod(j,np_rows) == my_prow) then - do i=0,nfact - if (mod(i+j,np_cols) == my_pcol) then - ncnt_s(np) = ncnt_s(np) + 1 - endif - enddo - endif - enddo - - ! Allocate send buffer - -#if REALCASE==1 - allocate(r_sbuf(nblk,nblk,sum(ncnt_s))) - r_sbuf(:,:,:) = 0. -#endif -#if COMPLEXCASE==1 - allocate(c_sbuf(nblk,nblk,sum(ncnt_s))) - c_sbuf(:,:,:) = 0. -#endif - - ! Determine start offsets in send buffer - - nstart_s(0) = 0 - do i=1,n_pes-1 - nstart_s(i) = nstart_s(i-1) + ncnt_s(i-1) - enddo - - ! Fill send buffer - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of a - - np = 0 - do j=0,(na-1)/nblk ! 
loop over rows of blocks - if (j/nfact==block_limits(np+1)) np = np+1 - if (mod(j,np_rows) == my_prow) then - do i=0,nfact - if (mod(i+j,np_cols) == my_pcol) then - nstart_s(np) = nstart_s(np) + 1 - js = (j/np_rows)*nblk - is = ((i+j)/np_cols)*nblk - jl = MIN(nblk,l_rows-js) - il = MIN(nblk,l_cols-is) - -#if REALCASE==1 - r_sbuf(1:jl,1:il,nstart_s(np)) = r_a(js+1:js+jl,is+1:is+il) -#endif -#if COMPLEXCASE==1 - c_sbuf(1:jl,1:il,nstart_s(np)) = c_a(js+1:js+jl,is+1:is+il) -#endif - endif - enddo - endif - enddo - - ! Count how many blocks we get from which PE - - ncnt_r(:) = 0 - do j=block_limits(my_pe)*nfact,min(block_limits(my_pe+1)*nfact-1,(na-1)/nblk) - npr = mod(j,np_rows) - do i=0,nfact - npc = mod(i+j,np_cols) - np = global_id(npr,npc) - ncnt_r(np) = ncnt_r(np) + 1 - enddo - enddo - - ! Allocate receive buffer - -#if REALCASE==1 - allocate(r_rbuf(nblk,nblk,sum(ncnt_r))) -#endif -#if COMPLEXCASE==1 - allocate(c_rbuf(nblk,nblk,sum(ncnt_r))) -#endif - - ! Set send counts/send offsets, receive counts/receive offsets - ! now actually in variables, not in blocks - - ncnt_s(:) = ncnt_s(:)*nblk*nblk - - nstart_s(0) = 0 - do i=1,n_pes-1 - nstart_s(i) = nstart_s(i-1) + ncnt_s(i-1) - enddo - - ncnt_r(:) = ncnt_r(:)*nblk*nblk - - nstart_r(0) = 0 - do i=1,n_pes-1 - nstart_r(i) = nstart_r(i-1) + ncnt_r(i-1) - enddo - - ! Exchange all data with MPI_Alltoallv - -#if REALCASE==1 - call MPI_Alltoallv(r_sbuf,ncnt_s,nstart_s,MPI_REAL8,r_rbuf,ncnt_r,nstart_r,MPI_REAL8,mpi_comm,mpierr) -#endif -#if COMPLEXCASE==1 - call MPI_Alltoallv(c_sbuf,ncnt_s,nstart_s,MPI_COMPLEX16,c_rbuf,ncnt_r,nstart_r,MPI_COMPLEX16,mpi_comm,mpierr) -#endif - - ! set band from receive buffer - - ncnt_r(:) = ncnt_r(:)/(nblk*nblk) - - nstart_r(0) = 0 - do i=1,n_pes-1 - nstart_r(i) = nstart_r(i-1) + ncnt_r(i-1) - enddo - -#if REALCASE==1 - allocate(r_buf((nfact+1)*nblk,nblk)) -#endif -#if COMPLEXCASE==1 - allocate(c_buf((nfact+1)*nblk,nblk)) -#endif - - ! 
n_off: Offset of ab within band - n_off = block_limits(my_pe)*nbw - - do j=block_limits(my_pe)*nfact,min(block_limits(my_pe+1)*nfact-1,(na-1)/nblk) - npr = mod(j,np_rows) - do i=0,nfact - npc = mod(i+j,np_cols) - np = global_id(npr,npc) - nstart_r(np) = nstart_r(np) + 1 -#if REALCASE==1 - r_buf(i*nblk+1:i*nblk+nblk,:) = transpose(r_rbuf(:,:,nstart_r(np))) -#endif -#if COMPLEXCASE==1 - c_buf(i*nblk+1:i*nblk+nblk,:) = conjg(transpose(c_rbuf(:,:,nstart_r(np)))) -#endif - enddo - do i=1,MIN(nblk,na-j*nblk) -#if REALCASE==1 - r_ab(1:nbw+1,i+j*nblk-n_off) = r_buf(i:i+nbw,i) -#endif -#if COMPLEXCASE==1 - c_ab(1:nbw+1,i+j*nblk-n_off) = c_buf(i:i+nbw,i) -#endif - enddo - enddo - - deallocate(ncnt_s, nstart_s) - deallocate(ncnt_r, nstart_r) - deallocate(global_id) - deallocate(block_limits) - -#if REALCASE==1 - deallocate(r_sbuf, r_rbuf, r_buf) -#endif -#if COMPLEXCASE==1 - deallocate(c_sbuf, c_rbuf, c_buf) -#endif - -#ifdef HAVE_DETAILED_TIMINGS -#if REALCASE==1 - call timer%stop("redist_band_real") -#endif -#if COMPLEXCASE==1 - call timer%stop("redist_band_complex") -#endif - -#endif - - -end subroutine - diff --git a/src/timer.F90 b/src/timer.F90 deleted file mode 100644 index 059dd5915..000000000 --- a/src/timer.F90 +++ /dev/null @@ -1,9 +0,0 @@ -module timings - use ftimings - - implicit none - - type(timer_t) :: timer - - -end module timings diff --git a/test/c_test_programs/elpa1_test_complex_c_version.c b/test/c_test_programs/elpa1_test_complex_c_version.c deleted file mode 100644 index 3039a36c2..000000000 --- a/test/c_test_programs/elpa1_test_complex_c_version.c +++ /dev/null @@ -1,225 +0,0 @@ -/* This file is part of ELPA. 
*/ -/* */ -/* The ELPA library was originally created by the ELPA consortium, */ -/* consisting of the following organizations: */ -/* */ -/* - Max Planck Computing and Data Facility (MPCDF), formerly known as */ -/* Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), */ -/* - Bergische Universität Wuppertal, Lehrstuhl für angewandte */ -/* Informatik, */ -/* - Technische Universität München, Lehrstuhl für Informatik mit */ -/* Schwerpunkt Wissenschaftliches Rechnen , */ -/* - Fritz-Haber-Institut, Berlin, Abt. Theorie, */ -/* - Max-Plack-Institut für Mathematik in den Naturwissenschaften, */ -/* Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, */ -/* and */ -/* - IBM Deutschland GmbH */ -/* */ -/* */ -/* More information can be found here: */ -/* http://elpa.mpcdf.mpg.de/ */ -/* */ -/* ELPA is free software: you can redistribute it and/or modify */ -/* it under the terms of the version 3 of the license of the */ -/* GNU Lesser General Public License as published by the Free */ -/* Software Foundation. */ -/* */ -/* ELPA is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ -/* GNU Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public License */ -/* along with ELPA. If not, see <http://www.gnu.org/licenses/> */ -/* */ -/* ELPA reflects a substantial effort on the part of the original */ -/* ELPA consortium, and we ask you to respect the spirit of the */ -/* license that we chose: i.e., please contribute any changes you */ -/* may have back to the original ELPA library distribution, and keep */ -/* any derivatives of ELPA under the same license that we chose for */ -/* the original distribution, the GNU Lesser General Public License. 
*/ -/* */ -/* */ - -#include "config-f90.h" - -#include <stdio.h> -#include <stdlib.h> -#include <mpi.h> -#include <math.h> - -#include <elpa/elpa.h> -#include <test/shared_sources/generated.h> -#include <complex.h> - -int main(int argc, char** argv) { - int myid; - int nprocs; - - int na, nev, nblk; - - int status; - - int np_cols, np_rows, np_colsStart; - - int my_blacs_ctxt, nprow, npcol, my_prow, my_pcol; - - int mpierr; - - int my_mpi_comm_world; - int mpi_comm_rows, mpi_comm_cols; - - int info, *sc_desc; - - int na_rows, na_cols; - double startVal; - - complex double *a, *z, *as, *tmp1, *tmp2; - - double *ev, *xr; - - int *iseed; - - int success; - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myid); - - na = 1000; - nev = 500; - nblk = 16; - - if (myid == 0) { - printf("This is the c version of an ELPA test-programm\n"); - printf("\n"); - printf("It will call the 1stage ELPA complex solver for a matrix\n"); - printf("of matrix size %d. It will compute %d eigenvalues\n",na,nev); - printf("and uses a blocksize of %d\n",nblk); - printf("\n"); - printf("This is an example program with much less functionality\n"); - printf("as it's Fortran counterpart. 
It's only purpose is to show how \n"); - printf("to evoke ELPA1 from a c programm\n"); - - printf("\n"); - - } - - status = 0; - - startVal = sqrt((double) nprocs); - np_colsStart = (int) round(startVal); - for (np_cols=np_colsStart;np_cols>1;np_cols--){ - if (nprocs %np_cols ==0){ - break; - } - } - - np_rows = nprocs/np_cols; - - if (myid == 0) { - printf("\n"); - printf("Number of processor rows %d, cols %d, total %d \n",np_rows,np_cols,nprocs); - } - - /* set up blacs */ - /* convert communicators before */ - my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD); - set_up_blacsgrid_from_fortran(my_mpi_comm_world, &my_blacs_ctxt, &np_rows, &np_cols, &nprow, &npcol, &my_prow, &my_pcol); - - if (myid == 0) { - printf("\n"); - printf("Past BLACS_Gridinfo...\n"); - printf("\n"); - } - - /* get the ELPA row and col communicators. */ - /* These are NOT usable in C without calling the MPI_Comm_f2c function on them !! */ - my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD); - mpierr = get_elpa_communicators(my_mpi_comm_world, my_prow, my_pcol, &mpi_comm_rows, &mpi_comm_cols); - - if (myid == 0) { - printf("\n"); - printf("Past split communicator setup for rows and columns...\n"); - printf("\n"); - } - - sc_desc = malloc(9*sizeof(int)); - - set_up_blacs_descriptor_from_fortran(na, nblk, my_prow, my_pcol, np_rows, np_cols, &na_rows, &na_cols, sc_desc, my_blacs_ctxt, &info); - - if (myid == 0) { - printf("\n"); - printf("Past scalapack descriptor setup...\n"); - printf("\n"); - } - - /* allocate the matrices needed for elpa */ - if (myid == 0) { - printf("\n"); - printf("Allocating matrices with na_rows=%d and na_cols=%d\n",na_rows, na_cols); - printf("\n"); - } - - a = malloc(na_rows*na_cols*sizeof(complex double)); - z = malloc(na_rows*na_cols*sizeof(complex double)); - as = malloc(na_rows*na_cols*sizeof(complex double)); - - xr = malloc(na_rows*na_cols*sizeof(double)); - - - ev = malloc(na*sizeof(double)); - - tmp1 = malloc(na_rows*na_cols*sizeof(complex double)); - tmp2 = 
malloc(na_rows*na_cols*sizeof(complex double)); - - iseed = malloc(4096*sizeof(int)); - - prepare_matrix_complex_from_fortran(na, myid, na_rows, na_cols, sc_desc, iseed, xr, a, z, as); - - free(xr); - - if (myid == 0) { - printf("\n"); - printf("Entering ELPA 1stage complex solver\n"); - printf("\n"); - } - - mpierr = MPI_Barrier(MPI_COMM_WORLD); - - success = elpa_solve_evp_complex_1stage(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, mpi_comm_rows, mpi_comm_cols); - - if (success != 1) { - printf("error in ELPA solve \n"); - mpierr = MPI_Abort(MPI_COMM_WORLD, 99); - } - - - if (myid == 0) { - printf("\n"); - printf("1stage ELPA complex solver complete\n"); - printf("\n"); - } - - /* check the results */ - status = check_correctness_complex_from_fortran(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2); - - if (status !=0){ - printf("The computed EVs are not correct !\n"); - } - if (status ==0){ - printf("All ok!\n"); - } - - free(sc_desc); - free(a); - free(z); - free(as); - - free(tmp1); - free(tmp2); - - MPI_Finalize(); - - return 0; -} diff --git a/test/c_test_programs/elpa1_test_real_c_version.c b/test/c_test_programs/elpa1_test_real_c_version.c deleted file mode 100644 index 547a3e76a..000000000 --- a/test/c_test_programs/elpa1_test_real_c_version.c +++ /dev/null @@ -1,218 +0,0 @@ -/* This file is part of ELPA. */ -/* */ -/* The ELPA library was originally created by the ELPA consortium, */ -/* consisting of the following organizations: */ -/* */ -/* - Max Planck Computing and Data Facility (MPCDF), formerly known as */ -/* Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), */ -/* - Bergische Universität Wuppertal, Lehrstuhl für angewandte */ -/* Informatik, */ -/* - Technische Universität München, Lehrstuhl für Informatik mit */ -/* Schwerpunkt Wissenschaftliches Rechnen , */ -/* - Fritz-Haber-Institut, Berlin, Abt. Theorie, */ -/* - Max-Plack-Institut für Mathematik in den Naturwissenschaften, */ -/* Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, */ -/* and */ -/* - IBM Deutschland GmbH */ -/* */ -/* */ -/* More information can be found here: */ -/* http://elpa.mpcdf.mpg.de/ */ -/* */ -/* ELPA is free software: you can redistribute it and/or modify */ -/* it under the terms of the version 3 of the license of the */ -/* GNU Lesser General Public License as published by the Free */ -/* Software Foundation. */ -/* */ -/* ELPA is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ -/* GNU Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public License */ -/* along with ELPA. If not, see <http://www.gnu.org/licenses/> */ -/* */ -/* ELPA reflects a substantial effort on the part of the original */ -/* ELPA consortium, and we ask you to respect the spirit of the */ -/* license that we chose: i.e., please contribute any changes you */ -/* may have back to the original ELPA library distribution, and keep */ -/* any derivatives of ELPA under the same license that we chose for */ -/* the original distribution, the GNU Lesser General Public License. 
*/ -/* */ -/* */ - -#include "config-f90.h" - -#include <stdio.h> -#include <stdlib.h> -#include <mpi.h> -#include <math.h> - -#include <elpa/elpa.h> - -#include "test/shared_sources/generated.h" - -int main(int argc, char** argv) { - int myid; - int nprocs; - - int na, nev, nblk; - - int status; - - int np_cols, np_rows, np_colsStart; - - int my_blacs_ctxt, nprow, npcol, my_prow, my_pcol; - - int mpierr; - - int my_mpi_comm_world; - int mpi_comm_rows, mpi_comm_cols; - - int info, *sc_desc; - - int na_rows, na_cols; - double startVal; - - double *a, *z, *as, *ev, *tmp1, *tmp2; - - int *iseed; - - int success; - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myid); - - na = 1000; - nev = 500; - nblk = 16; - - if (myid == 0) { - printf("This is the c version of an ELPA test-programm\n"); - printf("\n"); - printf("It will call the 1stage ELPA real solver for an\n"); - printf("of matrix size %d. It will compute %d eigenvalues\n",na,nev); - printf("and uses a blocksize of %d\n",nblk); - printf("\n"); - printf("This is an example program with much less functionality\n"); - printf("as it's Fortran counterpart. 
It's only purpose is to show how \n"); - printf("to evoke ELPA1 from a c programm\n"); - printf("\n"); - - } - - status = 0; - - startVal = sqrt((double) nprocs); - np_colsStart = (int) round(startVal); - for (np_cols=np_colsStart;np_cols>1;np_cols--){ - if (nprocs %np_cols ==0){ - break; - } - } - - np_rows = nprocs/np_cols; - - if (myid == 0) { - printf("\n"); - printf("Number of processor rows %d, cols %d, total %d \n",np_rows,np_cols,nprocs); - } - - /* set up blacs */ - /* convert communicators before */ - my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD); - set_up_blacsgrid_from_fortran(my_mpi_comm_world, &my_blacs_ctxt, &np_rows, &np_cols, &nprow, &npcol, &my_prow, &my_pcol); - - if (myid == 0) { - printf("\n"); - printf("Past BLACS_Gridinfo...\n"); - printf("\n"); - } - - /* get the ELPA row and col communicators. */ - /* These are NOT usable in C without calling the MPI_Comm_f2c function on them !! */ - my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD); - mpierr = get_elpa_communicators(my_mpi_comm_world, my_prow, my_pcol, &mpi_comm_rows, &mpi_comm_cols); - - if (myid == 0) { - printf("\n"); - printf("Past split communicator setup for rows and columns...\n"); - printf("\n"); - } - - sc_desc = malloc(9*sizeof(int)); - - set_up_blacs_descriptor_from_fortran(na, nblk, my_prow, my_pcol, np_rows, np_cols, &na_rows, &na_cols, sc_desc, my_blacs_ctxt, &info); - - if (myid == 0) { - printf("\n"); - printf("Past scalapack descriptor setup...\n"); - printf("\n"); - } - - /* allocate the matrices needed for elpa */ - if (myid == 0) { - printf("\n"); - printf("Allocating matrices with na_rows=%d and na_cols=%d\n",na_rows, na_cols); - printf("\n"); - } - - a = malloc(na_rows*na_cols*sizeof(double)); - z = malloc(na_rows*na_cols*sizeof(double)); - as = malloc(na_rows*na_cols*sizeof(double)); - - - ev = malloc(na*sizeof(double)); - - tmp1 = malloc(na_rows*na_cols*sizeof(double)); - tmp2 = malloc(na_rows*na_cols*sizeof(double)); - - iseed = malloc(4096*sizeof(int)); - - 
prepare_matrix_real_from_fortran(na, myid, na_rows, na_cols, sc_desc, iseed, a, z, as); - - if (myid == 0) { - printf("\n"); - printf("Entering ELPA 1stage real solver\n"); - printf("\n"); - } - - mpierr = MPI_Barrier(MPI_COMM_WORLD); - - success = elpa_solve_evp_real_1stage(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, mpi_comm_rows, mpi_comm_cols); - - if (success != 1) { - printf("error in ELPA solve \n"); - mpierr = MPI_Abort(MPI_COMM_WORLD, 99); - } - - - if (myid == 0) { - printf("\n"); - printf("1stage ELPA real solver complete\n"); - printf("\n"); - } - - /* check the results */ - status = check_correctness_real_from_fortran(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2); - - if (status !=0){ - printf("The computed EVs are not correct !\n"); - } - if (status ==0){ - printf("All ok!\n"); - } - - free(sc_desc); - free(a); - free(z); - free(as); - - free(tmp1); - free(tmp2); - - MPI_Finalize(); - - return 0; -} diff --git a/test/c_test_programs/elpa2_test_complex_c_version.c b/test/c_test_programs/elpa2_test_complex_c_version.c deleted file mode 100644 index 0033f1cb7..000000000 --- a/test/c_test_programs/elpa2_test_complex_c_version.c +++ /dev/null @@ -1,229 +0,0 @@ -/* This file is part of ELPA. */ -/* */ -/* The ELPA library was originally created by the ELPA consortium, */ -/* consisting of the following organizations: */ -/* */ -/* - Max Planck Computing and Data Facility (MPCDF), formerly known as */ -/* Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), */ -/* - Bergische Universität Wuppertal, Lehrstuhl für angewandte */ -/* Informatik, */ -/* - Technische Universität München, Lehrstuhl für Informatik mit */ -/* Schwerpunkt Wissenschaftliches Rechnen , */ -/* - Fritz-Haber-Institut, Berlin, Abt. Theorie, */ -/* - Max-Plack-Institut für Mathematik in den Naturwissenschaften, */ -/* Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, */ -/* and */ -/* - IBM Deutschland GmbH */ -/* */ -/* */ -/* More information can be found here: */ -/* http://elpa.mpcdf.mpg.de/ */ -/* */ -/* ELPA is free software: you can redistribute it and/or modify */ -/* it under the terms of the version 3 of the license of the */ -/* GNU Lesser General Public License as published by the Free */ -/* Software Foundation. */ -/* */ -/* ELPA is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ -/* GNU Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public License */ -/* along with ELPA. If not, see <http://www.gnu.org/licenses/> */ -/* */ -/* ELPA reflects a substantial effort on the part of the original */ -/* ELPA consortium, and we ask you to respect the spirit of the */ -/* license that we chose: i.e., please contribute any changes you */ -/* may have back to the original ELPA library distribution, and keep */ -/* any derivatives of ELPA under the same license that we chose for */ -/* the original distribution, the GNU Lesser General Public License. 
*/ -/* */ -/* */ - -#include "config-f90.h" - -#include <stdio.h> -#include <stdlib.h> -#include <mpi.h> -#include <math.h> - -#include <elpa/elpa.h> -#include <test/shared_sources/generated.h> -#include <complex.h> - -int main(int argc, char** argv) { - int myid; - int nprocs; - - int na, nev, nblk; - - int status; - - int np_cols, np_rows, np_colsStart; - - int my_blacs_ctxt, nprow, npcol, my_prow, my_pcol; - - int mpierr; - - int my_mpi_comm_world; - int mpi_comm_rows, mpi_comm_cols; - - int info, *sc_desc; - - int na_rows, na_cols; - double startVal; - - complex double *a, *z, *as, *tmp1, *tmp2; - - double *ev, *xr; - - int *iseed; - - int success; - - int THIS_COMPLEX_ELPA_KERNEL_API; - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myid); - - na = 1000; - nev = 500; - nblk = 16; - - if (myid == 0) { - printf("This is the c version of an ELPA test-programm\n"); - printf("\n"); - printf("It will call the 1stage ELPA complex solver for a matrix\n"); - printf("of matrix size %d. It will compute %d eigenvalues\n",na,nev); - printf("and uses a blocksize of %d\n",nblk); - printf("\n"); - printf("This is an example program with much less functionality\n"); - printf("as it's Fortran counterpart. 
It's only purpose is to show how \n"); - printf("to evoke ELPA1 from a c programm\n"); - - printf("\n"); - - } - - status = 0; - - startVal = sqrt((double) nprocs); - np_colsStart = (int) round(startVal); - for (np_cols=np_colsStart;np_cols>1;np_cols--){ - if (nprocs %np_cols ==0){ - break; - } - } - - np_rows = nprocs/np_cols; - - if (myid == 0) { - printf("\n"); - printf("Number of processor rows %d, cols %d, total %d \n",np_rows,np_cols,nprocs); - } - - /* set up blacs */ - /* convert communicators before */ - my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD); - set_up_blacsgrid_from_fortran(my_mpi_comm_world, &my_blacs_ctxt, &np_rows, &np_cols, &nprow, &npcol, &my_prow, &my_pcol); - - if (myid == 0) { - printf("\n"); - printf("Past BLACS_Gridinfo...\n"); - printf("\n"); - } - - /* get the ELPA row and col communicators. */ - /* These are NOT usable in C without calling the MPI_Comm_f2c function on them !! */ - my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD); - mpierr = get_elpa_communicators(my_mpi_comm_world, my_prow, my_pcol, &mpi_comm_rows, &mpi_comm_cols); - - if (myid == 0) { - printf("\n"); - printf("Past split communicator setup for rows and columns...\n"); - printf("\n"); - } - - sc_desc = malloc(9*sizeof(int)); - - set_up_blacs_descriptor_from_fortran(na, nblk, my_prow, my_pcol, np_rows, np_cols, &na_rows, &na_cols, sc_desc, my_blacs_ctxt, &info); - - if (myid == 0) { - printf("\n"); - printf("Past scalapack descriptor setup...\n"); - printf("\n"); - } - - /* allocate the matrices needed for elpa */ - if (myid == 0) { - printf("\n"); - printf("Allocating matrices with na_rows=%d and na_cols=%d\n",na_rows, na_cols); - printf("\n"); - } - - a = malloc(na_rows*na_cols*sizeof(complex double)); - z = malloc(na_rows*na_cols*sizeof(complex double)); - as = malloc(na_rows*na_cols*sizeof(complex double)); - - xr = malloc(na_rows*na_cols*sizeof(double)); - - - ev = malloc(na*sizeof(double)); - - tmp1 = malloc(na_rows*na_cols*sizeof(complex double)); - tmp2 = 
malloc(na_rows*na_cols*sizeof(complex double)); - - iseed = malloc(4096*sizeof(int)); - - prepare_matrix_complex_from_fortran(na, myid, na_rows, na_cols, sc_desc, iseed, xr, a, z, as); - - free(xr); - - if (myid == 0) { - printf("\n"); - printf("Entering ELPA 2stage complex solver\n"); - printf("\n"); - } - - mpierr = MPI_Barrier(MPI_COMM_WORLD); - THIS_COMPLEX_ELPA_KERNEL_API = ELPA2_COMPLEX_KERNEL_GENERIC; - success = elpa_solve_evp_complex_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, mpi_comm_rows, mpi_comm_cols, my_mpi_comm_world, THIS_COMPLEX_ELPA_KERNEL_API); - - if (success != 1) { - printf("error in ELPA solve \n"); - mpierr = MPI_Abort(MPI_COMM_WORLD, 99); - } - - - if (myid == 0) { - printf("\n"); - printf("2stage ELPA complex solver complete\n"); - printf("\n"); - } - - /* check the results */ - status = check_correctness_complex_from_fortran(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2); - - if (status !=0){ - printf("The computed EVs are not correct !\n"); - } - if (status ==0){ - if (myid == 0) { - printf("All ok!\n"); - } - } - - free(sc_desc); - free(a); - free(z); - free(as); - - free(tmp1); - free(tmp2); - - MPI_Finalize(); - - return 0; -} diff --git a/test/c_test_programs/elpa2_test_real_c_version.c b/test/c_test_programs/elpa2_test_real_c_version.c deleted file mode 100644 index bfc46c009..000000000 --- a/test/c_test_programs/elpa2_test_real_c_version.c +++ /dev/null @@ -1,223 +0,0 @@ -/* This file is part of ELPA. 
*/ -/* */ -/* The ELPA library was originally created by the ELPA consortium, */ -/* consisting of the following organizations: */ -/* */ -/* - Max Planck Computing and Data Facility (MPCDF), formerly known as */ -/* Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), */ -/* - Bergische Universität Wuppertal, Lehrstuhl für angewandte */ -/* Informatik, */ -/* - Technische Universität München, Lehrstuhl für Informatik mit */ -/* Schwerpunkt Wissenschaftliches Rechnen , */ -/* - Fritz-Haber-Institut, Berlin, Abt. Theorie, */ -/* - Max-Plack-Institut für Mathematik in den Naturwissenschaften, */ -/* Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, */ -/* and */ -/* - IBM Deutschland GmbH */ -/* */ -/* */ -/* More information can be found here: */ -/* http://elpa.mpcdf.mpg.de/ */ -/* */ -/* ELPA is free software: you can redistribute it and/or modify */ -/* it under the terms of the version 3 of the license of the */ -/* GNU Lesser General Public License as published by the Free */ -/* Software Foundation. */ -/* */ -/* ELPA is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ -/* GNU Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public License */ -/* along with ELPA. If not, see <http://www.gnu.org/licenses/> */ -/* */ -/* ELPA reflects a substantial effort on the part of the original */ -/* ELPA consortium, and we ask you to respect the spirit of the */ -/* license that we chose: i.e., please contribute any changes you */ -/* may have back to the original ELPA library distribution, and keep */ -/* any derivatives of ELPA under the same license that we chose for */ -/* the original distribution, the GNU Lesser General Public License. 
*/ -/* */ -/* */ - -#include "config-f90.h" - -#include <stdio.h> -#include <stdlib.h> -#include <mpi.h> -#include <math.h> - -#include <elpa/elpa.h> -#include <test/shared_sources/generated.h> - -int main(int argc, char** argv) { - int myid; - int nprocs; - - int na, nev, nblk; - - int status; - - int np_cols, np_rows, np_colsStart; - - int my_blacs_ctxt, nprow, npcol, my_prow, my_pcol; - - int mpierr; - - int my_mpi_comm_world; - int mpi_comm_rows, mpi_comm_cols; - - int info, *sc_desc; - - int na_rows, na_cols; - double startVal; - - double *a, *z, *as, *ev, *tmp1, *tmp2; - - int *iseed; - - int success; - - int useQr, THIS_REAL_ELPA_KERNEL_API; - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myid); - - na = 1000; - nev = 500; - nblk = 16; - - if (myid == 0) { - printf("This is the c version of an ELPA test-programm\n"); - printf("\n"); - printf("It will call the 1stage ELPA real solver for an\n"); - printf("of matrix size %d. It will compute %d eigenvalues\n",na,nev); - printf("and uses a blocksize of %d\n",nblk); - printf("\n"); - printf("This is an example program with much less functionality\n"); - printf("as it's Fortran counterpart. 
It's only purpose is to show how \n"); - printf("to evoke ELPA1 from a c programm\n"); - printf("\n"); - - } - - status = 0; - - startVal = sqrt((double) nprocs); - np_colsStart = (int) round(startVal); - for (np_cols=np_colsStart;np_cols>1;np_cols--){ - if (nprocs %np_cols ==0){ - break; - } - } - - np_rows = nprocs/np_cols; - - if (myid == 0) { - printf("\n"); - printf("Number of processor rows %d, cols %d, total %d \n",np_rows,np_cols,nprocs); - } - - /* set up blacs */ - /* convert communicators before */ - my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD); - set_up_blacsgrid_from_fortran(my_mpi_comm_world, &my_blacs_ctxt, &np_rows, &np_cols, &nprow, &npcol, &my_prow, &my_pcol); - - if (myid == 0) { - printf("\n"); - printf("Past BLACS_Gridinfo...\n"); - printf("\n"); - } - - /* get the ELPA row and col communicators. */ - /* These are NOT usable in C without calling the MPI_Comm_f2c function on them !! */ - my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD); - mpierr = get_elpa_communicators(my_mpi_comm_world, my_prow, my_pcol, &mpi_comm_rows, &mpi_comm_cols); - - if (myid == 0) { - printf("\n"); - printf("Past split communicator setup for rows and columns...\n"); - printf("\n"); - } - - sc_desc = malloc(9*sizeof(int)); - - set_up_blacs_descriptor_from_fortran(na, nblk, my_prow, my_pcol, np_rows, np_cols, &na_rows, &na_cols, sc_desc, my_blacs_ctxt, &info); - - if (myid == 0) { - printf("\n"); - printf("Past scalapack descriptor setup...\n"); - printf("\n"); - } - - /* allocate the matrices needed for elpa */ - if (myid == 0) { - printf("\n"); - printf("Allocating matrices with na_rows=%d and na_cols=%d\n",na_rows, na_cols); - printf("\n"); - } - - a = malloc(na_rows*na_cols*sizeof(double)); - z = malloc(na_rows*na_cols*sizeof(double)); - as = malloc(na_rows*na_cols*sizeof(double)); - - - ev = malloc(na*sizeof(double)); - - tmp1 = malloc(na_rows*na_cols*sizeof(double)); - tmp2 = malloc(na_rows*na_cols*sizeof(double)); - - iseed = malloc(4096*sizeof(int)); - - 
prepare_matrix_real_from_fortran(na, myid, na_rows, na_cols, sc_desc, iseed, a, z, as); - - if (myid == 0) { - printf("\n"); - printf("Entering ELPA 2stage real solver\n"); - printf("\n"); - } - - mpierr = MPI_Barrier(MPI_COMM_WORLD); - useQr = 0; - THIS_REAL_ELPA_KERNEL_API = ELPA2_REAL_KERNEL_GENERIC; - - success = elpa_solve_evp_real_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, mpi_comm_rows, mpi_comm_cols, my_mpi_comm_world, THIS_REAL_ELPA_KERNEL_API, useQr); - - if (success != 1) { - printf("error in ELPA solve \n"); - mpierr = MPI_Abort(MPI_COMM_WORLD, 99); - } - - - if (myid == 0) { - printf("\n"); - printf("2stage ELPA real solver complete\n"); - printf("\n"); - } - - /* check the results */ - status = check_correctness_real_from_fortran(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2); - - if (status !=0){ - printf("The computed EVs are not correct !\n"); - } - if (status ==0){ - if (myid ==0) { - printf("All ok!\n"); - } - } - - free(sc_desc); - free(a); - free(z); - free(as); - - free(tmp1); - free(tmp2); - - MPI_Finalize(); - - return 0; -} diff --git a/test/fortran_test_programs/read_real.F90 b/test/fortran_test_programs/read_real.F90 deleted file mode 100644 index e2ce5dfc2..000000000 --- a/test/fortran_test_programs/read_real.F90 +++ /dev/null @@ -1,432 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! 
- IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 1 real case library. -!> This program can read a matrix from an ascii -!> file and computes then the Eigenvectors. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -program read_real - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - REAL version -! -! This program demonstrates the use of the ELPA module -! 
together with standard scalapack routines -!------------------------------------------------------------------------------- - - use precision - use ELPA1 - use elpa_utilities, only : error_unit -#ifdef WITH_OPENMP - use test_util -#endif -#ifdef HAVE_REDIRECT - use redirect -#endif -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - include 'mpif.h' - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik), parameter :: nblk = 16 - - !------------------------------------------------------------------------------- - ! Local Variables - - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol, lenarg - - integer, external :: numroc - - real(kind=rk) :: err, errmax - real(kind=rk), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:) - - character*256 :: filename -#ifdef WITH_OPENMP - integer(kind=iK) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - !------------------------------------------------------------------------------- - ! MPI Initialization - -#ifndef WITH_OPENMP - call mpi_init(mpierr) -#else - required_mpi_thread_level = MPI_THREAD_MULTIPLE - - call mpi_init_thread(required_mpi_thread_level, & - provided_mpi_thread_level, mpierr) - - if (required_mpi_thread_level .ne. 
provided_mpi_thread_level) then - write(error_unit,*) "MPI ERROR: MPI_THREAD_MULTIPLE is not provided on this system" - write(error_unit,*) " only ", mpi_thread_level_name(provided_mpi_thread_level), " is available" - call exit(77) - endif - -#endif - call mpi_comm_rank(mpi_comm_world,myid,mpierr) - call mpi_comm_size(mpi_comm_world,nprocs,mpierr) - -#ifdef HAVE_REDIRECT - if (check_redirect_environment_variable()) then - if (myid .eq. 0) then - print *," " - print *,"Redirection of mpi processes is used" - print *," " - if (create_directories() .ne. 1) then - write(error_unit,*) "Unable to create directory for stdout and stderr!" - stop - endif - endif - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) - call redirect_stdout(myid) - endif -#endif - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - !------------------------------------------------------------------------------- - ! Get the name of the input file containing the matrix and open input file - ! Please note: - ! get_command_argument is a FORTRAN 2003 intrinsic which may not be implemented - ! for every Fortran compiler!!! 
- - if(myid==0) then - call get_command_argument(1,filename,lenarg,info) - if(info/=0) then - write(error_unit,*) 'Usage: test_real matrix_file' - call mpi_abort(mpi_comm_world,1,mpierr) - endif - open(10,file=filename,action='READ',status='OLD',iostat=info) - if(info/=0) then - write(error_unit,*) 'Error: Unable to open ',trim(filename) - call mpi_abort(mpi_comm_world,1,mpierr) - endif - endif - call mpi_barrier(mpi_comm_world, mpierr) ! Just for safety - - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - REAL version' - print * - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every - ! process has a unique (my_prow,my_pcol) pair). - - my_blacs_ctxt = mpi_comm_world - call BLACS_Gridinit( my_blacs_ctxt, 'C', np_rows, np_cols ) - call BLACS_Gridinfo( my_blacs_ctxt, nprow, npcol, my_prow, my_pcol ) - - ! All ELPA routines need MPI communicators for communicating within - ! 
rows or columns of processes, these are set in get_elpa_communicators - - call get_elpa_communicators(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - - ! Read matrix size - if(myid==0) read(10,*) na - call mpi_bcast(na, 1, mpi_integer, 0, mpi_comm_world, mpierr) - - ! Quick check for plausibility - if(na<=0 .or. na>10000000) then - if(myid==0) write(error_unit,*) 'Illegal value for matrix size: ',na - call mpi_finalize(mpierr) - stop - endif - if(myid==0) print *,'Matrix size: ',na - - ! Determine the necessary size of the distributed matrices, - ! we use the Scalapack tools routine NUMROC for that. - - na_rows = numroc(na, nblk, my_prow, 0, np_rows) - na_cols = numroc(na, nblk, my_pcol, 0, np_cols) - - ! Set up a scalapack descriptor for the checks below. - ! For ELPA the following restrictions hold: - ! - block sizes in both directions must be identical (args 4+5) - ! - first row and column of the distributed matrix must be on row/col 0/0 (args 6+7) - - call descinit( sc_desc, na, na, nblk, nblk, 0, 0, my_blacs_ctxt, na_rows, info ) - - !------------------------------------------------------------------------------- - ! Allocate matrices -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("set up matrix") -#endif - allocate(a (na_rows,na_cols)) - allocate(z (na_rows,na_cols)) - allocate(as(na_rows,na_cols)) - - allocate(ev(na)) - - !------------------------------------------------------------------------------- - ! Read matrix - - call read_matrix(10, na, a, ubound(a,1), nblk, my_prow, my_pcol, np_rows, np_cols) - if(myid==0) close(10) - - nev = na ! all eigenvaules - - ! Save original matrix A for later accuracy checks - - as = a -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("set up matrix") -#endif - !------------------------------------------------------------------------------- - ! Calculate eigenvalues/eigenvectors - - call mpi_barrier(mpi_comm_world, mpierr) ! 
for correct timings only - call solve_evp_real_1stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & - mpi_comm_rows, mpi_comm_cols) - - if(myid == 0) print *,'Time tridiag_real :',time_evp_fwd - if(myid == 0) print *,'Time solve_tridi :',time_evp_solve - if(myid == 0) print *,'Time trans_ev_real:',time_evp_back - - if(myid == 0) then - do i=1,nev - print '(i6,g25.15)',i,ev(i) - enddo - endif - - !------------------------------------------------------------------------------- - ! Test correctness of result (using plain scalapack routines) - - deallocate(a) - allocate(tmp1(na_rows,na_cols)) - - ! 1. Residual (maximum of || A*Zi - Zi*EVi ||) - - ! tmp1 = A * Z - call pdgemm('N','N',na,nev,na,1.d0,as,1,1,sc_desc, & - z,1,1,sc_desc,0.d0,tmp1,1,1,sc_desc) - - deallocate(as) - allocate(tmp2(na_rows,na_cols)) - - ! tmp2 = Zi*EVi - tmp2(:,:) = z(:,:) - do i=1,nev - call pdscal(na,ev(i),tmp2,1,i,sc_desc,1) - enddo - - ! tmp1 = A*Zi - Zi*EVi - tmp1(:,:) = tmp1(:,:) - tmp2(:,:) - - ! Get maximum norm of columns of tmp1 - errmax = 0 - do i=1,nev - err = 0 - call pdnrm2(na,err,tmp1,1,i,sc_desc,1) - errmax = max(errmax, err) - enddo - - ! Get maximum error norm over all processors - err = errmax - call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr) - if(myid==0) print * - if(myid==0) print *,'Error Residual :',errmax - - ! 2. Eigenvector orthogonality - - ! tmp1 = Z**T * Z - tmp1 = 0 - call pdgemm('T','N',nev,nev,na,1.d0,z,1,1,sc_desc, & - z,1,1,sc_desc,0.d0,tmp1,1,1,sc_desc) - ! Initialize tmp2 to unit matrix - tmp2 = 0 - call pdlaset('A',nev,nev,0.d0,1.d0,tmp2,1,1,sc_desc) - - ! tmp1 = Z**T * Z - Unit Matrix - tmp1(:,:) = tmp1(:,:) - tmp2(:,:) - - ! 
Get maximum error (max abs value in tmp1) - err = maxval(abs(tmp1)) - call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr) - if(myid==0) print *,'Error Orthogonality:',errmax - - deallocate(z) - deallocate(tmp1) - deallocate(tmp2) - deallocate(ev) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("program") - print *," " - print *,"Timings program:" - call timer%print("program") - print *," " - print *,"End timings program" -#endif - call blacs_gridexit(my_blacs_ctxt) - call mpi_finalize(mpierr) - -end - -!------------------------------------------------------------------------------- -subroutine read_matrix(iunit, na, a, lda, nblk, my_prow, my_pcol, np_rows, np_cols) - - implicit none - include 'mpif.h' - - integer, intent(in) :: iunit, na, lda, nblk, my_prow, my_pcol, np_rows, np_cols - real*8, intent(out) :: a(lda, *) - - integer i, j, lr, lc, myid, mpierr - integer, allocatable :: l_row(:), l_col(:) - - real*8, allocatable :: col(:) - - ! allocate and set index arrays - - allocate(l_row(na)) - allocate(l_col(na)) - - ! Mapping of global rows/cols to local - - l_row(:) = 0 - l_col(:) = 0 - - lr = 0 ! local row counter - lc = 0 ! local column counter - - do i = 1, na - - if( MOD((i-1)/nblk,np_rows) == my_prow) then - ! row i is on local processor - lr = lr+1 - l_row(i) = lr - endif - - if( MOD((i-1)/nblk,np_cols) == my_pcol) then - ! 
column i is on local processor - lc = lc+1 - l_col(i) = lc - endif - - enddo - - call mpi_comm_rank(mpi_comm_world,myid,mpierr) - allocate(col(na)) - - do i=1,na - if(myid==0) read(iunit,*) col(1:i) - call mpi_bcast(col,i,MPI_REAL8,0,MPI_COMM_WORLD,mpierr) - if(l_col(i) > 0) then - do j=1,i - if(l_row(j)>0) a(l_row(j),l_col(i)) = col(j) - enddo - endif - if(l_row(i) > 0) then - do j=1,i-1 - if(l_col(j)>0) a(l_row(i),l_col(j)) = col(j) - enddo - endif - enddo - - deallocate(l_row, l_col, col) - -end subroutine read_matrix diff --git a/test/fortran_test_programs/test_complex.F90 b/test/fortran_test_programs/test_complex.F90 deleted file mode 100644 index 7a0e91653..000000000 --- a/test/fortran_test_programs/test_complex.F90 +++ /dev/null @@ -1,332 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! 
You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 1 complex case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -program test_complex - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - COMPLEX version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". 
-!------------------------------------------------------------------------------- - use precision - use ELPA1 - use elpa_utilities, only : error_unit -#ifdef WITH_OPENMP - use test_util -#endif - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix -#ifdef HAVE_REDIRECT - use redirect -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - include 'mpif.h' - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - real(kind=rk), allocatable :: ev(:), xr(:,:) - - complex(kind=ck), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:) - - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - logical :: write_to_file - logical :: success - - success = .true. - ! read input parameters if they are provided - call read_input_parameters(na, nev, nblk, write_to_file) - - !------------------------------------------------------------------------------- - ! MPI Initialization - call setup_mpi(myid, nprocs) - - STATUS = 0 -#ifdef WITH_OPENMP - if (myid .eq. 
0) then - print *,"Threaded version of test program" - print *,"Using ",omp_get_max_threads()," threads" - print *," " - endif -#endif - -#ifdef HAVE_REDIRECT - if (check_redirect_environment_variable()) then - if (myid .eq. 0) then - print *," " - print *,"Redirection of mpi processes is used" - print *," " - if (create_directories() .ne. 1) then - write(error_unit,*) "Unable to create directory for stdout and stderr!" - stop - endif - endif - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) - call redirect_stdout(myid) - endif -#endif - if (write_to_file) then - if (myid .eq. 0) print *,"Writing output files" - endif - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! 
at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - COMPLEX version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every - ! process has a unique (my_prow,my_pcol) pair). - - call set_up_blacsgrid(mpi_comm_world, my_blacs_ctxt, np_rows, np_cols, & - nprow, npcol, my_prow, my_pcol) - - if (myid==0) then - print '(a)','| Past BLACS_Gridinfo.' - end if - - ! All ELPA routines need MPI communicators for communicating within - ! rows or columns of processes, these are set in get_elpa_communicators. - - mpierr = get_elpa_communicators(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - - if (myid==0) then - print '(a)','| Past split communicator setup for rows and columns.' - end if - - ! Determine the necessary size of the distributed matrices, - ! we use the Scalapack tools routine NUMROC for that. - - call set_up_blacs_descriptor(na ,nblk, my_prow, my_pcol, np_rows, np_cols, & - na_rows, na_cols, sc_desc, my_blacs_ctxt, info) - - if (myid==0) then - print '(a)','| Past scalapack descriptor setup.' - end if - - !------------------------------------------------------------------------------- - ! 
Allocate matrices and set up a test matrix for the eigenvalue problem - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("set up matrix") -#endif - allocate(a (na_rows,na_cols)) - allocate(z (na_rows,na_cols)) - allocate(as(na_rows,na_cols)) - - allocate(ev(na)) - - allocate(xr(na_rows,na_cols)) - - call prepare_matrix(na, myid, sc_desc, iseed, xr, a, z, as) - - deallocate(xr) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("set up matrix") -#endif - !------------------------------------------------------------------------------- - ! Calculate eigenvalues/eigenvectors - - if (myid==0) then - print '(a)','| Entering one-step ELPA solver ... ' - print * - end if - - call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only - success = solve_evp_complex_1stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & - na_cols, mpi_comm_rows, mpi_comm_cols) - - if (.not.(success)) then - write(error_unit,*) "solve_evp_complex produced an error! Aborting..." - call MPI_ABORT(mpi_comm_world, 1, mpierr) - endif - - if (myid==0) then - print '(a)','| One-step ELPA solver complete.' - print * - end if - - if(myid == 0) print *,'Time tridiag_complex :',time_evp_fwd - if(myid == 0) print *,'Time solve_tridi :',time_evp_solve - if(myid == 0) print *,'Time trans_ev_complex :',time_evp_back - if(myid == 0) print *,'Total time (sum above):',time_evp_back+time_evp_solve+time_evp_fwd - - if(write_to_file) then - if (myid == 0) then - open(17,file="EVs_complex_out.txt",form='formatted',status='new') - do i=1,na - write(17,*) i,ev(i) - enddo - close(17) - endif - endif - !------------------------------------------------------------------------------- - ! 
Test correctness of result (using plain scalapack routines) - allocate(tmp1(na_rows,na_cols)) - allocate(tmp2(na_rows,na_cols)) - - status = check_correctness(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) - - deallocate(a) - deallocate(as) - - deallocate(z) - deallocate(tmp1) - deallocate(tmp2) - deallocate(ev) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("program") - print *," " - print *,"Timings program:" - call timer%print("program") - print *," " - print *,"End timings program" -#endif - call blacs_gridexit(my_blacs_ctxt) - call mpi_finalize(mpierr) - call EXIT(STATUS) -end - -!------------------------------------------------------------------------------- diff --git a/test/fortran_test_programs/test_complex2.F90 b/test/fortran_test_programs/test_complex2.F90 deleted file mode 100644 index 79ecfaac5..000000000 --- a/test/fortran_test_programs/test_complex2.F90 +++ /dev/null @@ -1,360 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! 
but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 2 complex case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -!> The complex ELPA 2 kernel is set as the default kernel. -!> However, this can be overriden by setting -!> the environment variable "COMPLEX_ELPA_KERNEL" to an -!> appropiate value. -!> -program test_complex2 - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - COMPLEX version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. 
The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -!------------------------------------------------------------------------------- - use precision - use ELPA1 - use ELPA2 - use elpa_utilities, only : error_unit -#ifdef WITH_OPENMP - use test_util -#endif - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - -#ifdef HAVE_REDIRECT - use redirect -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - include 'mpif.h' - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - integer, external :: numroc - - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - real(kind=rk), allocatable :: ev(:), xr(:,:) - - complex(kind=ck), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:) - - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - logical :: write_to_file - logical :: success - - success = .true. 
- - call read_input_parameters(na, nev, nblk, write_to_file) - !------------------------------------------------------------------------------- - ! MPI Initialization - call setup_mpi(myid, nprocs) - - STATUS = 0 - -#ifdef WITH_OPENMP - if (myid .eq. 0) then - print *,"Threaded version of test program" - print *,"Using ",omp_get_max_threads()," threads" - print *," " - endif -#endif -#ifdef HAVE_REDIRECT - if (check_redirect_environment_variable()) then - if (myid .eq. 0) then - print *," " - print *,"Redirection of mpi processes is used" - print *," " - if (create_directories() .ne. 1) then - write(error_unit,*) "Unable to create directory for stdout and stderr!" - stop - endif - endif - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) - call redirect_stdout(myid) - endif -#endif - - if (myid .eq. 0) then - print *," " - print *,"This ELPA2 is build with" -#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL - print *,"AVX optimized kernel (2 blocking) for complex matrices" -#endif -#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL - print *,"AVX optimized kernel (1 blocking) for complex matrices" -#endif - -#ifdef WITH_COMPLEX_GENERIC_KERNEL - print *,"GENERIC kernel for complex matrices" -#endif -#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL - print *,"GENERIC SIMPLE kernel for complex matrices" -#endif -#ifdef WITH_COMPLEX_SSE_KERNEL - print *,"SSE ASSEMBLER kernel for complex matrices" -#endif - - endif - - if (write_to_file) then - if (myid .eq. 0) print *,"Writing output files" - endif - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) 
- - - call timer%enable() - - call timer%start("program") -#endif - - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - COMPLEX version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every - ! process has a unique (my_prow,my_pcol) pair). - - call set_up_blacsgrid(mpi_comm_world, my_blacs_ctxt, np_rows, np_cols, & - nprow, npcol, my_prow, my_pcol) - - if (myid==0) then - print '(a)','| Past BLACS_Gridinfo.' - end if - - ! All ELPA routines need MPI communicators for communicating within - ! rows or columns of processes, these are set in get_elpa_communicators. - - mpierr = get_elpa_communicators(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - - if (myid==0) then - print '(a)','| Past split communicator setup for rows and columns.' - end if - - ! Determine the necessary size of the distributed matrices, - ! 
we use the Scalapack tools routine NUMROC for that. - - call set_up_blacs_descriptor(na ,nblk, my_prow, my_pcol, np_rows, np_cols, & - na_rows, na_cols, sc_desc, my_blacs_ctxt, info) - - if (myid==0) then - print '(a)','| Past scalapack descriptor setup.' - end if - !------------------------------------------------------------------------------- - ! Allocate matrices and set up a test matrix for the eigenvalue problem - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("set up matrix") -#endif - allocate(a (na_rows,na_cols)) - allocate(z (na_rows,na_cols)) - allocate(as(na_rows,na_cols)) - - allocate(ev(na)) - allocate(xr(na_rows,na_cols)) - - call prepare_matrix(na, myid, sc_desc, iseed, xr, a, z, as) - - deallocate(xr) - - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("set up matrix") -#endif - - ! set print flag in elpa1 - elpa_print_times = .true. - - !------------------------------------------------------------------------------- - ! Calculate eigenvalues/eigenvectors - - call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only - success = solve_evp_complex_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & - na_cols, & - mpi_comm_rows, mpi_comm_cols, mpi_comm_world) - - if (.not.(success)) then - write(error_unit,*) "solve_evp_complex_2stage produced an error! Aborting..." - call MPI_ABORT(mpi_comm_world, 1, mpierr) - endif - - if(myid == 0) print *,'Time transform to tridi :',time_evp_fwd - if(myid == 0) print *,'Time solve tridi :',time_evp_solve - if(myid == 0) print *,'Time transform back EVs :',time_evp_back - if(myid == 0) print *,'Total time (sum above) :',time_evp_back+time_evp_solve+time_evp_fwd - - if(write_to_file) then - if (myid == 0) then - open(17,file="EVs_complex2_out.txt",form='formatted',status='new') - do i=1,na - write(17,*) i,ev(i) - enddo - close(17) - endif - endif - !------------------------------------------------------------------------------- - ! 
Test correctness of result (using plain scalapack routines) - allocate(tmp1(na_rows,na_cols)) - allocate(tmp2(na_rows,na_cols)) - - status = check_correctness(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) - - deallocate(a) - deallocate(as) - - deallocate(z) - deallocate(tmp1) - deallocate(tmp2) - deallocate(ev) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("program") - print *," " - print *,"Timings program:" - call timer%print("program") - print *," " - print *,"End timings program" -#endif - - call blacs_gridexit(my_blacs_ctxt) - call mpi_finalize(mpierr) -! call EXIT(STATUS) -end - -!------------------------------------------------------------------------------- diff --git a/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 b/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 deleted file mode 100644 index 48777705a..000000000 --- a/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 +++ /dev/null @@ -1,364 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! 
ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 2 complex case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -!> The complex ELPA 2 kernel is set in this program via -!> the API call. However, this can be overriden by setting -!> the environment variable "COMPLEX_ELPA_KERNEL" to an -!> appropiate value. -!> -program test_complex2 - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - COMPLEX version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! 
Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -!------------------------------------------------------------------------------- - - use precision - use ELPA1 - use ELPA2 - use elpa_utilities, only : error_unit - use elpa2_utilities - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - -#ifdef WITH_OPENMP - use test_util -#endif - -#ifdef HAVE_REDIRECT - use redirect -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - - implicit none - include 'mpif.h' - - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - integer, external :: numroc - - real(kind=rk), allocatable :: ev(:), xr(:,:) - - complex(kind=ck), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:) - - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - logical :: write_to_file - logical :: success - - success = .true. 
- - call read_input_parameters(na, nev, nblk, write_to_file) - - !------------------------------------------------------------------------------- - ! MPI Initialization - call setup_mpi(myid, nprocs) - STATUS = 0 - -#ifdef WITH_OPENMP - if (myid .eq. 0) then - print *,"Threaded version of test program" - print *,"Using ",omp_get_max_threads()," threads" - print *," " - endif -#endif - -#ifdef HAVE_REDIRECT - if (check_redirect_environment_variable()) then - if (myid .eq. 0) then - print *," " - print *,"Redirection of mpi processes is used" - print *," " - if (create_directories() .ne. 1) then - write(error_unit,*) "Unable to create directory for stdout and stderr!" - stop - endif - endif - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) - call redirect_stdout(myid) - endif -#endif - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - - if (write_to_file) then - if (myid .eq. 0) print *,"Writing output files" - endif - - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! 
at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - COMPLEX version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - print *, "This is an example how to determine the ELPA2 kernel with" - print *, "an api call. Note, however, that setting the kernel via" - print *, "an environment variable will always take precedence over" - print *, "everything else! " - print * -#ifndef HAVE_ENVIRONMENT_CHECKING - print *, " Notice that it is not possible with this build to set the " - print *, " kernel via an environment variable! To change this re-install" - print *, " the library and have a look at the log files" -#endif - print *, " The settings are: COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE" - print * - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every - ! process has a unique (my_prow,my_pcol) pair). - - call set_up_blacsgrid(mpi_comm_world, my_blacs_ctxt, np_rows, np_cols, & - nprow, npcol, my_prow, my_pcol) - - if (myid==0) then - print '(a)','| Past BLACS_Gridinfo.' - end if - - ! All ELPA routines need MPI communicators for communicating within - ! rows or columns of processes, these are set in get_elpa_communicators - - mpierr = get_elpa_communicators(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - - if (myid==0) then - print '(a)','| Past split communicator setup for rows and columns.' - end if - - ! 
Determine the necessary size of the distributed matrices, - ! we use the Scalapack tools routine NUMROC for that. - - call set_up_blacs_descriptor(na ,nblk, my_prow, my_pcol, np_rows, np_cols, & - na_rows, na_cols, sc_desc, my_blacs_ctxt, info) - - - if (myid==0) then - print '(a)','| Past scalapack descriptor setup.' - end if - !------------------------------------------------------------------------------- - ! Allocate matrices and set up a test matrix for the eigenvalue problem -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("set up matrix") -#endif - allocate(a (na_rows,na_cols)) - allocate(z (na_rows,na_cols)) - allocate(as(na_rows,na_cols)) - - allocate(ev(na)) - allocate(xr(na_rows,na_cols)) - - call prepare_matrix(na, myid, sc_desc, iseed, xr, a, z, as) - - deallocate(xr) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("set up matrix") -#endif - - ! set print flag in elpa1 - elpa_print_times = .true. - - !------------------------------------------------------------------------------- - ! Calculate eigenvalues/eigenvectors - - if (myid==0) then - print '(a)','| Entering two-stage ELPA solver ... ' - print * - end if - - - ! ELPA is called a kernel specification in the API - - call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only - success = solve_evp_complex_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & - na_cols, & - mpi_comm_rows, mpi_comm_cols, mpi_comm_world, & - COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) - - - if (.not.(success)) then - write(error_unit,*) "solve_evp_complex_2stage produced an error! Aborting..." 
- call MPI_ABORT(mpi_comm_world, 1, mpierr) - endif - - if(myid == 0) print *,'Time transform to tridi :',time_evp_fwd - if(myid == 0) print *,'Time solve tridi :',time_evp_solve - if(myid == 0) print *,'Time transform back EVs :',time_evp_back - if(myid == 0) print *,'Total time (sum above) :',time_evp_back+time_evp_solve+time_evp_fwd - - if(write_to_file) then - if (myid == 0) then - open(17,file="EVs_complex2_out.txt",form='formatted',status='new') - do i=1,na - write(17,*) i,ev(i) - enddo - close(17) - endif - endif - !------------------------------------------------------------------------------- - ! Test correctness of result (using plain scalapack routines) - allocate(tmp1(na_rows,na_cols)) - allocate(tmp2(na_rows,na_cols)) - - status = check_correctness(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) - - deallocate(a) - deallocate(as) - - deallocate(z) - deallocate(tmp1) - deallocate(tmp2) - deallocate(ev) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("program") - print *," " - print *,"Timings program:" - call timer%print("program") - print *," " - print *,"End timings program" -#endif - - call blacs_gridexit(my_blacs_ctxt) - call mpi_finalize(mpierr) - call EXIT(STATUS) -end - -!------------------------------------------------------------------------------- diff --git a/test/fortran_test_programs/test_complex2_default_kernel.F90 b/test/fortran_test_programs/test_complex2_default_kernel.F90 deleted file mode 100644 index 4797336bc..000000000 --- a/test/fortran_test_programs/test_complex2_default_kernel.F90 +++ /dev/null @@ -1,362 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! 
- Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 2 complex case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. 
-!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -!> The complex ELPA 2 kernel is set as the default kernel. -!> However, this can be overriden by setting -!> the environment variable "COMPLEX_ELPA_KERNEL" to an -!> appropiate value. -!> -program test_complex2 - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - COMPLEX version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -!------------------------------------------------------------------------------- - use precision - use ELPA1 - use ELPA2 - - use elpa_utilities, only : error_unit - use elpa2_utilities - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - -#ifdef WITH_OPENMP - use test_util -#endif - -#ifdef HAVE_REDIRECT - use redirect -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - include 'mpif.h' - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! 
nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - integer, external :: numroc - - real(kind=rk), allocatable :: ev(:), xr(:,:) - - complex(kind=ck), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:) - - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - logical :: write_to_file - logical :: success - - success = .true. - - call read_input_parameters(na, nev, nblk, write_to_file) - !------------------------------------------------------------------------------- - ! MPI Initialization - call setup_mpi(myid, nprocs) - - STATUS = 0 - -#ifdef WITH_OPENMP - if (myid .eq. 0) then - print *,"Threaded version of test program" - print *,"Using ",omp_get_max_threads()," threads" - print *," " - endif -#endif -#ifdef HAVE_REDIRECT - if (check_redirect_environment_variable()) then - if (myid .eq. 0) then - print *," " - print *,"Redirection of mpi processes is used" - print *," " - if (create_directories() .ne. 1) then - write(error_unit,*) "Unable to create directory for stdout and stderr!" - stop - endif - endif - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) - call redirect_stdout(myid) - endif -#endif - - if (write_to_file) then - if (myid .eq. 0) print *,"Writing output files" - endif - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) 
-#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - COMPLEX version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - print *, "This is an example how ELPA2 chooses a default kernel," -#ifdef HAVE_ENVIRONMENT_CHECKING - print *, "or takes the kernel defined in the environment variable," -#endif - print *, "since the ELPA API call does not contain any kernel specification" - print * - print *, " The settings are: ",trim(get_actual_complex_kernel_name())," as complex kernel" - print * -#ifndef HAVE_ENVIRONMENT_CHECKING - print *, " Notice that it is not possible with this build to set the " - print *, " kernel via an environment variable! To change this re-install" - print *, " the library and have a look at the log files" -#endif - - - endif - - !------------------------------------------------------------------------------- - ! 
Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every - ! process has a unique (my_prow,my_pcol) pair). - - call set_up_blacsgrid(mpi_comm_world, my_blacs_ctxt, np_rows, np_cols, & - nprow, npcol, my_prow, my_pcol) - - if (myid==0) then - print '(a)','| Past BLACS_Gridinfo.' - end if - - ! All ELPA routines need MPI communicators for communicating within - ! rows or columns of processes, these are set in get_elpa_communicators - - mpierr = get_elpa_communicators(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - - if (myid==0) then - print '(a)','| Past split communicator setup for rows and columns.' - end if - - ! Determine the necessary size of the distributed matrices, - ! we use the Scalapack tools routine NUMROC for that. - - call set_up_blacs_descriptor(na ,nblk, my_prow, my_pcol, np_rows, np_cols, & - na_rows, na_cols, sc_desc, my_blacs_ctxt, info) - - if (myid==0) then - print '(a)','| Past scalapack descriptor setup.' - end if - !------------------------------------------------------------------------------- - ! Allocate matrices and set up a test matrix for the eigenvalue problem -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("set up matrix") -#endif - allocate(a (na_rows,na_cols)) - allocate(z (na_rows,na_cols)) - allocate(as(na_rows,na_cols)) - - allocate(ev(na)) - allocate(xr(na_rows,na_cols)) - - call prepare_matrix(na, myid, sc_desc, iseed, xr, a, z, as) - - deallocate(xr) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("set up matrix") -#endif - ! set print flag in elpa1 - elpa_print_times = .true. - - !------------------------------------------------------------------------------- - ! 
Calculate eigenvalues/eigenvectors - - if (myid==0) then - print '(a)','| Entering two-stage ELPA solver ... ' - print * - end if - - - ! ELPA is called without any kernel specification in the API, - ! furthermore, if the environment variable is not set, the - ! default kernel is called. Otherwise, the kernel defined in the - ! environment variable - - call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only - success = solve_evp_complex_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & - na_cols, & - mpi_comm_rows, mpi_comm_cols, mpi_comm_world) - - if (.not.(success)) then - write(error_unit,*) "solve_evp_complex_2stage produced an error! Aborting..." - call MPI_ABORT(mpi_comm_world, 1, mpierr) - endif - - if(myid == 0) print *,'Time transform to tridi :',time_evp_fwd - if(myid == 0) print *,'Time solve tridi :',time_evp_solve - if(myid == 0) print *,'Time transform back EVs :',time_evp_back - if(myid == 0) print *,'Total time (sum above) :',time_evp_back+time_evp_solve+time_evp_fwd - - if(write_to_file) then - if (myid == 0) then - open(17,file="EVs_complex2_out.txt",form='formatted',status='new') - do i=1,na - write(17,*) i,ev(i) - enddo - close(17) - endif - endif - !------------------------------------------------------------------------------- - ! 
Test correctness of result (using plain scalapack routines) - allocate(tmp1(na_rows,na_cols)) - allocate(tmp2(na_rows,na_cols)) - - status = check_correctness(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) - - deallocate(a) - deallocate(as) - - deallocate(z) - deallocate(tmp1) - deallocate(tmp2) - deallocate(ev) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("program") - print *," " - print *,"Timings program:" - call timer%print("program") - print *," " - print *,"End timings program" -#endif - call blacs_gridexit(my_blacs_ctxt) - call mpi_finalize(mpierr) - call EXIT(STATUS) -end - -!------------------------------------------------------------------------------- diff --git a/test/fortran_test_programs/test_real.F90 b/test/fortran_test_programs/test_real.F90 deleted file mode 100644 index e7d16a3de..000000000 --- a/test/fortran_test_programs/test_real.F90 +++ /dev/null @@ -1,338 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! 
but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 1 real case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -program test_real - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - REAL version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! 
-!------------------------------------------------------------------------------- - use precision - use ELPA1 - use elpa_utilities, only : error_unit -#ifdef WITH_OPENMP - use test_util -#endif - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - -#ifdef HAVE_REDIRECT - use redirect -#endif -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - - implicit none - include 'mpif.h' - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - integer, external :: numroc - - real(kind=rk), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:) - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, & - provided_mpi_thread_level -#endif - logical :: write_to_file - logical :: success - !------------------------------------------------------------------------------- - - success = .true. - - call read_input_parameters(na, nev, nblk, write_to_file) - - !------------------------------------------------------------------------------- - ! MPI Initialization - call setup_mpi(myid, nprocs) - - if (write_to_file) then - if (myid .eq. 0) print *,"Writing output files" - endif - -#ifdef HAVE_DETAILED_TIMINGS - - ! 
initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - - STATUS = 0 -#ifdef WITH_OPENMP - if (myid .eq. 0) then - print *,"Threaded version of test program" - print *,"Using ",omp_get_max_threads()," threads" - print *," " - endif -#endif - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) - -#ifdef HAVE_REDIRECT - if (check_redirect_environment_variable()) then - if (myid .eq. 0) then - print *," " - print *,"Redirection of mpi processes is used" - print *," " - if (create_directories() .ne. 1) then - write(error_unit,*) "Unable to create directory for stdout and stderr!" - stop - endif - endif - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) - call redirect_stdout(myid) - endif -#endif - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - - ! 
at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - REAL version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every - ! process has a unique (my_prow,my_pcol) pair). - - call set_up_blacsgrid(mpi_comm_world, my_blacs_ctxt, np_rows, np_cols, & - nprow, npcol, my_prow, my_pcol) - - if (myid==0) then - print '(a)','| Past BLACS_Gridinfo.' - end if - - ! All ELPA routines need MPI communicators for communicating within - ! rows or columns of processes, these are set in get_elpa_communicators. - - mpierr = get_elpa_communicators(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - - if (myid==0) then - print '(a)','| Past split communicator setup for rows and columns.' - end if - - call set_up_blacs_descriptor(na ,nblk, my_prow, my_pcol, np_rows, np_cols, & - na_rows, na_cols, sc_desc, my_blacs_ctxt, info) - - if (myid==0) then - print '(a)','| Past scalapack descriptor setup.' - end if - - !------------------------------------------------------------------------------- - ! 
Allocate matrices and set up a test matrix for the eigenvalue problem -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("set up matrix") -#endif - allocate(a (na_rows,na_cols)) - allocate(z (na_rows,na_cols)) - allocate(as(na_rows,na_cols)) - - allocate(ev(na)) - - call prepare_matrix(na, myid, sc_desc, iseed, a, z, as) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("set up matrix") -#endif - - !------------------------------------------------------------------------------- - ! Calculate eigenvalues/eigenvectors - - if (myid==0) then - print '(a)','| Entering one-step ELPA solver ... ' - print * - end if - - call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only - success = solve_evp_real_1stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & - na_cols, mpi_comm_rows, mpi_comm_cols) - - if (.not.(success)) then - write(error_unit,*) "solve_evp_real_1stage produced an error! Aborting..." - call MPI_ABORT(mpi_comm_world, 1, mpierr) - endif - - - if (myid==0) then - print '(a)','| One-step ELPA solver complete.' - print * - end if - - if(myid == 0) print *,'Time tridiag_real :',time_evp_fwd - if(myid == 0) print *,'Time solve_tridi :',time_evp_solve - if(myid == 0) print *,'Time trans_ev_real :',time_evp_back - if(myid == 0) print *,'Total time (sum above):',time_evp_back+time_evp_solve+time_evp_fwd - if(write_to_file) then - if (myid == 0) then - open(17,file="EVs_real_out.txt",form='formatted',status='new') - do i=1,na - write(17,*) i,ev(i) - enddo - close(17) - endif - endif - - !------------------------------------------------------------------------------- - ! 
Test correctness of result (using plain scalapack routines) - allocate(tmp1(na_rows,na_cols)) - allocate(tmp2(na_rows,na_cols)) - - status = check_correctness(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) - - deallocate(a) - deallocate(as) - - deallocate(z) - deallocate(tmp1) - deallocate(tmp2) - deallocate(ev) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("program") - print *," " - print *,"Timings program:" - print *," " - call timer%print("program") - print *," " - print *,"End timings program" - print *," " - print *,"End timings program" -#endif - call blacs_gridexit(my_blacs_ctxt) - call mpi_finalize(mpierr) - - call EXIT(STATUS) - - -end - -!------------------------------------------------------------------------------- diff --git a/test/fortran_test_programs/test_real2.F90 b/test/fortran_test_programs/test_real2.F90 deleted file mode 100644 index d5842ecb3..000000000 --- a/test/fortran_test_programs/test_real2.F90 +++ /dev/null @@ -1,364 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! 
ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 2 real case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -!> The real ELPA 2 kernel is set as the default kernel. -!> However, this can be overriden by setting -!> the environment variable "REAL_ELPA_KERNEL" to an -!> appropiate value. -!> -program test_real2 - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - REAL version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! 
Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -!------------------------------------------------------------------------------- - use precision - use ELPA1 - use ELPA2 - use elpa_utilities, only : error_unit -#ifdef WITH_OPENMP - use test_util -#endif - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - -#ifdef HAVE_REDIRECT - use redirect -#endif -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - include 'mpif.h' - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - integer, external :: numroc - - real(kind=rk), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:) - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - logical :: write_to_file - logical :: success - - success = .true. - - call read_input_parameters(na, nev, nblk, write_to_file) - - !------------------------------------------------------------------------------- - ! 
MPI Initialization - call setup_mpi(myid, nprocs) - - STATUS = 0 -#ifdef WITH_OPENMP - if (myid .eq. 0) then - print *,"Threaded version of test program" - print *,"Using ",omp_get_max_threads()," threads" - print *," " - endif -#endif - -#ifdef HAVE_REDIRECT - if (check_redirect_environment_variable()) then - if (myid .eq. 0) then - print *," " - print *,"Redirection of mpi processes is used" - print *," " - if (create_directories() .ne. 1) then - write(error_unit,*) "Unable to create directory for stdout and stderr!" - stop - endif - endif - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) - call redirect_stdout(myid) - endif -#endif - - if (myid .eq. 0) then - print *," " - print *,"This ELPA2 is build with" -#ifdef WITH_REAL_AVX_BLOCK2_KERNEL - print *,"AVX optimized kernel (2 blocking) for real matrices" -#endif -#ifdef WITH_REAL_AVX_BLOCK4_KERNEL - print *,"AVX optimized kernel (4 blocking) for real matrices" -#endif -#ifdef WITH_REAL_AVX_BLOCK6_KERNEL - print *,"AVX optimized kernel (6 blocking) for real matrices" -#endif - -#ifdef WITH_REAL_GENERIC_KERNEL - print *,"GENERIC kernel for real matrices" -#endif -#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL - print *,"GENERIC SIMPLE kernel for real matrices" -#endif -#ifdef WITH_REAL_SSE_KERNEL - print *,"SSE ASSEMBLER kernel for real matrices" -#endif -#ifdef WITH_REAL_BGP_KERNEL - print *,"BGP kernel for real matrices" -#endif -#ifdef WITH_REAL_BGQ_KERNEL - print *,"BGQ kernel for real matrices" -#endif - endif - if (write_to_file) then - if (myid .eq. 0) print *,"Writing output files" - endif - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. 
, & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - REAL version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every - ! process has a unique (my_prow,my_pcol) pair). - - call set_up_blacsgrid(mpi_comm_world, my_blacs_ctxt, np_rows, np_cols, & - nprow, npcol, my_prow, my_pcol) - - if (myid==0) then - print '(a)','| Past BLACS_Gridinfo.' - end if - - ! All ELPA routines need MPI communicators for communicating within - ! rows or columns of processes, these are set in get_elpa_communicators. - - mpierr = get_elpa_communicators(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - - if (myid==0) then - print '(a)','| Past split communicator setup for rows and columns.' 
- end if - - call set_up_blacs_descriptor(na ,nblk, my_prow, my_pcol, np_rows, np_cols, & - na_rows, na_cols, sc_desc, my_blacs_ctxt, info) - - if (myid==0) then - print '(a)','| Past scalapack descriptor setup.' - end if - - !------------------------------------------------------------------------------- - ! Allocate matrices and set up a test matrix for the eigenvalue problem -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("set up matrix") -#endif - allocate(a (na_rows,na_cols)) - allocate(z (na_rows,na_cols)) - allocate(as(na_rows,na_cols)) - - allocate(ev(na)) - - call prepare_matrix(na, myid, sc_desc, iseed, a, z, as) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("set up matrix") -#endif - ! set print flag in elpa1 - elpa_print_times = .true. - - !------------------------------------------------------------------------------- - ! Calculate eigenvalues/eigenvectors - - if (myid==0) then - print '(a)','| Entering two-stage ELPA solver ... ' - print * - end if - - call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only - success = solve_evp_real_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, & - mpi_comm_rows, mpi_comm_cols, mpi_comm_world) - - if (.not.(success)) then - write(error_unit,*) "solve_evp_real_2stage produced an error! Aborting..." - call MPI_ABORT(mpi_comm_world, 1, mpierr) - endif - - if (myid==0) then - print '(a)','| Two-step ELPA solver complete.' - print * - end if - - if(myid == 0) print *,'Time transform to tridi :',time_evp_fwd - if(myid == 0) print *,'Time solve tridi :',time_evp_solve - if(myid == 0) print *,'Time transform back EVs :',time_evp_back - if(myid == 0) print *,'Total time (sum above) :',time_evp_back+time_evp_solve+time_evp_fwd - - - if(write_to_file) then - if (myid == 0) then - open(17,file="EVs_real2_out.txt",form='formatted',status='new') - do i=1,na - write(17,*) i,ev(i) - enddo - close(17) - endif - endif - !------------------------------------------------------------------------------- - ! 
Test correctness of result (using plain scalapack routines) - allocate(tmp1(na_rows,na_cols)) - allocate(tmp2(na_rows,na_cols)) - - status = check_correctness(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) - - deallocate(a) - deallocate(as) - - deallocate(z) - deallocate(tmp1) - deallocate(tmp2) - deallocate(ev) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("program") - print *," " - print *,"Timings program:" - call timer%print("program") - print *," " - print *,"End timings program" -#endif - call blacs_gridexit(my_blacs_ctxt) - call mpi_finalize(mpierr) - call EXIT(STATUS) -end - -!------------------------------------------------------------------------------- diff --git a/test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 b/test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 deleted file mode 100644 index ec2a56ca5..000000000 --- a/test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 +++ /dev/null @@ -1,357 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! 
ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 2 real case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -!> The complex ELPA 2 kernel is set in this program via -!> the API call. However, this can be overriden by setting -!> the environment variable "REAL_ELPA_KERNEL" to an -!> appropiate value. -!> -program test_real2 - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - REAL version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! 
Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -!------------------------------------------------------------------------------- - use precision - use ELPA1 - use ELPA2 - - use elpa_utilities, only : error_unit - use elpa2_utilities - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - -#ifdef WITH_OPENMP - use test_util -#endif - -#ifdef HAVE_REDIRECT - use redirect -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - - implicit none - include 'mpif.h' - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - integer, external :: numroc - - real(kind=rk), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:) - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - logical :: write_to_file - logical :: success - - success = .true. - - call read_input_parameters(na, nev, nblk, write_to_file) - - !------------------------------------------------------------------------------- - ! 
MPI Initialization - call setup_mpi(myid, nprocs) - - STATUS = 0 - -#ifdef WITH_OPENMP - if (myid .eq. 0) then - print *,"Threaded version of test program" - print *,"Using ",omp_get_max_threads()," threads" - print *," " - endif -#endif -#ifdef HAVE_REDIRECT - if (check_redirect_environment_variable()) then - if (myid .eq. 0) then - print *," " - print *,"Redirection of mpi processes is used" - print *," " - if (create_directories() .ne. 1) then - write(error_unit,*) "Unable to create directory for stdout and stderr!" - stop - endif - endif - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) - call redirect_stdout(myid) - endif -#endif - if (write_to_file) then - if (myid .eq. 0) print *,"Writing output files" - endif - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! 
at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - REAL version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - print *, "This is an example how to determine the ELPA2 kernel with" - print *, "an api call. Note, however, that setting the kernel via" - print *, "an environment variable will always take precedence over" - print *, "everything else! " - print * -#ifndef HAVE_ENVIRONMENT_CHECKING - print *, " Notice that it is not possible with this build to set the " - print *, " kernel via an environment variable! To change this re-install" - print *, " the library and have a look at the log files" -#endif - print *, " The settings are: REAL_ELPA_KERNEL_GENERIC_SIMPLE" - print * - - - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every - ! process has a unique (my_prow,my_pcol) pair). - - call set_up_blacsgrid(mpi_comm_world, my_blacs_ctxt, np_rows, np_cols, & - nprow, npcol, my_prow, my_pcol) - - if (myid==0) then - print '(a)','| Past BLACS_Gridinfo.' - end if - - ! All ELPA routines need MPI communicators for communicating within - ! rows or columns of processes, these are set in get_elpa_communicators. - - mpierr = get_elpa_communicators(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - - if (myid==0) then - print '(a)','| Past split communicator setup for rows and columns.' 
- end if - - call set_up_blacs_descriptor(na ,nblk, my_prow, my_pcol, np_rows, np_cols, & - na_rows, na_cols, sc_desc, my_blacs_ctxt, info) - - if (myid==0) then - print '(a)','| Past scalapack descriptor setup.' - end if - - !------------------------------------------------------------------------------- - ! Allocate matrices and set up a test matrix for the eigenvalue problem -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("set up matrix") -#endif - allocate(a (na_rows,na_cols)) - allocate(z (na_rows,na_cols)) - allocate(as(na_rows,na_cols)) - - allocate(ev(na)) - - call prepare_matrix(na, myid, sc_desc, iseed, a, z, as) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("set up matrix") -#endif - ! set print flag in elpa1 - elpa_print_times = .true. - - !------------------------------------------------------------------------------- - ! Calculate eigenvalues/eigenvectors - - if (myid==0) then - print '(a)','| Entering two-stage ELPA solver ... ' - print * - end if - - - ! ELPA is called with a kernel specification in the API - - call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only - success = solve_evp_real_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & - na_cols, & - mpi_comm_rows, mpi_comm_cols, mpi_comm_world, & - REAL_ELPA_KERNEL_GENERIC_SIMPLE) - - if (.not.(success)) then - write(error_unit,*) "solve_evp_real_2stage produced an error! Aborting..." - call MPI_ABORT(mpi_comm_world, 1, mpierr) - endif - - if (myid==0) then - print '(a)','| Two-step ELPA solver complete.' 
- print * - end if - - if(myid == 0) print *,'Time transform to tridi :',time_evp_fwd - if(myid == 0) print *,'Time solve tridi :',time_evp_solve - if(myid == 0) print *,'Time transform back EVs :',time_evp_back - if(myid == 0) print *,'Total time (sum above) :',time_evp_back+time_evp_solve+time_evp_fwd - - if(write_to_file) then - if (myid == 0) then - open(17,file="EVs_real2_out.txt",form='formatted',status='new') - do i=1,na - write(17,*) i,ev(i) - enddo - close(17) - endif - endif - !------------------------------------------------------------------------------- - ! Test correctness of result (using plain scalapack routines) - - allocate(tmp1(na_rows,na_cols)) - allocate(tmp2(na_rows,na_cols)) - - status = check_correctness(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) - - deallocate(a) - deallocate(as) - - deallocate(z) - deallocate(tmp1) - deallocate(tmp2) - deallocate(ev) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("program") - print *," " - print *,"Timings program:" - call timer%print("program") - print *," " - print *,"End timings program" -#endif - call blacs_gridexit(my_blacs_ctxt) - call mpi_finalize(mpierr) - call EXIT(STATUS) -end - -!------------------------------------------------------------------------------- diff --git a/test/fortran_test_programs/test_real2_default_kernel.F90 b/test/fortran_test_programs/test_real2_default_kernel.F90 deleted file mode 100644 index 86acd3961..000000000 --- a/test/fortran_test_programs/test_real2_default_kernel.F90 +++ /dev/null @@ -1,355 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! 
Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 2 real case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. 
-!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -!> The real ELPA 2 kernel is set as the default kernel. -!> However, this can be overriden by setting -!> the environment variable "REAL_ELPA_KERNEL" to an -!> appropiate value. -!> -program test_real2 - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - REAL version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -!------------------------------------------------------------------------------- - use precision - use ELPA1 - use ELPA2 - use elpa_utilities, only : error_unit - use elpa2_utilities - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - -#ifdef WITH_OPENMP - use test_util -#endif - -#ifdef HAVE_REDIRECT - use redirect -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - include 'mpif.h' - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! 
nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - integer, external :: numroc - - real(kind=rk), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:) - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - logical :: write_to_file - logical :: success - - success = .true. - - call read_input_parameters(na, nev, nblk, write_to_file) - !------------------------------------------------------------------------------- - ! MPI Initialization - call setup_mpi(myid, nprocs) - - STATUS = 0 - -#ifdef WITH_OPENMP - if (myid .eq. 0) then - print *,"Threaded version of test program" - print *,"Using ",omp_get_max_threads()," threads" - print *," " - endif -#endif -#ifdef HAVE_REDIRECT - if (check_redirect_environment_variable()) then - if (myid .eq. 0) then - print *," " - print *,"Redirection of mpi processes is used" - print *," " - if (create_directories() .ne. 1) then - write(error_unit,*) "Unable to create directory for stdout and stderr!" - stop - endif - endif - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) - call redirect_stdout(myid) - endif -#endif - - if (write_to_file) then - if (myid .eq. 0) print *,"Writing output files" - endif - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) 
- call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - REAL version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - print *, "This is an example how ELPA2 chooses a default kernel," -#ifdef HAVE_ENVIRONMENT_CHECKING - print *, "or takes the kernel defined in the environment variable," -#endif - print *, "since the ELPA API call does not contain any kernel specification" - print * - print *, " The settings are: ",trim(get_actual_real_kernel_name())," as real kernel" - print * -#ifndef HAVE_ENVIRONMENT_CHECKING - print *, " Notice that it is not possible with this build to set the " - print *, " kernel via an environment variable! To change this re-install" - print *, " the library and have a look at the log files" -#endif - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! 
For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every - ! process has a unique (my_prow,my_pcol) pair). - - call set_up_blacsgrid(mpi_comm_world, my_blacs_ctxt, np_rows, np_cols, & - nprow, npcol, my_prow, my_pcol) - - if (myid==0) then - print '(a)','| Past BLACS_Gridinfo.' - end if - - ! All ELPA routines need MPI communicators for communicating within - ! rows or columns of processes, these are set in get_elpa_communicators. - - mpierr = get_elpa_communicators(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - - if (myid==0) then - print '(a)','| Past split communicator setup for rows and columns.' - end if - - call set_up_blacs_descriptor(na ,nblk, my_prow, my_pcol, np_rows, np_cols, & - na_rows, na_cols, sc_desc, my_blacs_ctxt, info) - - if (myid==0) then - print '(a)','| Past scalapack descriptor setup.' - end if - - !------------------------------------------------------------------------------- - ! Allocate matrices and set up a test matrix for the eigenvalue problem -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("set up matrix") -#endif - allocate(a (na_rows,na_cols)) - allocate(z (na_rows,na_cols)) - allocate(as(na_rows,na_cols)) - - allocate(ev(na)) - - call prepare_matrix(na, myid, sc_desc, iseed, a, z, as) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("set up matrix") -#endif - ! set print flag in elpa1 - elpa_print_times = .true. - - !------------------------------------------------------------------------------- - ! Calculate eigenvalues/eigenvectors - - if (myid==0) then - print '(a)','| Entering two-stage ELPA solver ... ' - print * - end if - - - ! ELPA is called without any kernel specification in the API, - ! furthermore, if the environment variable is not set, the - ! default kernel is called. Otherwise, the kernel defined in the - ! 
environment variable - - call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only - success = solve_evp_real_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & - na_cols, & - mpi_comm_rows, mpi_comm_cols, mpi_comm_world) - - if (.not.(success)) then - write(error_unit,*) "solve_evp_real_2stage produced an error! Aborting..." - call MPI_ABORT(mpi_comm_world, 1, mpierr) - endif - - if (myid==0) then - print '(a)','| Two-step ELPA solver complete.' - print * - end if - - if(myid == 0) print *,'Time transform to tridi :',time_evp_fwd - if(myid == 0) print *,'Time solve tridi :',time_evp_solve - if(myid == 0) print *,'Time transform back EVs :',time_evp_back - if(myid == 0) print *,'Total time (sum above) :',time_evp_back+time_evp_solve+time_evp_fwd - - if(write_to_file) then - if (myid == 0) then - open(17,file="EVs_real2_out.txt",form='formatted',status='new') - do i=1,na - write(17,*) i,ev(i) - enddo - close(17) - endif - endif - !------------------------------------------------------------------------------- - ! 
Test correctness of result (using plain scalapack routines) - allocate(tmp1(na_rows,na_cols)) - allocate(tmp2(na_rows,na_cols)) - - status = check_correctness(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) - - deallocate(a) - deallocate(as) - - deallocate(z) - deallocate(tmp1) - deallocate(tmp2) - deallocate(ev) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("program") - print *," " - print *,"Timings program:" - call timer%print("program") - print *," " - print *,"End timings program" -#endif - call blacs_gridexit(my_blacs_ctxt) - call mpi_finalize(mpierr) - call EXIT(STATUS) -end - -!------------------------------------------------------------------------------- diff --git a/test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 b/test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 deleted file mode 100644 index 368b863e8..000000000 --- a/test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 +++ /dev/null @@ -1,367 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! 
ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 2 real case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -!> The real ELPA 2 kernel is set as the default kernel. -!> In this test case the qr_decomposition is used. -!> However, this can be overriden by setting -!> the environment variable "REAL_ELPA_KERNEL" to an -!> appropiate value. -!> -program test_real2 - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - REAL version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! 
Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -!------------------------------------------------------------------------------- - use precision - use ELPA1 - use ELPA2 - use elpa_utilities, only : error_unit - use elpa2_utilities - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - -#ifdef WITH_OPENMP - use test_util -#endif - -#ifdef HAVE_REDIRECT - use redirect -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - include 'mpif.h' - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - integer, external :: numroc - - real(kind=rk), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:) - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - logical :: write_to_file - logical :: success - - success = .true. - write_to_file = .false. 
- - if (COMMAND_ARGUMENT_COUNT() /= 0) then - write(error_unit,*) "This program does not support any command-line arguments" - stop 1 - endif - - nblk = 2 - na = 4000 - nev = 1500 - - !------------------------------------------------------------------------------- - ! MPI Initialization - call setup_mpi(myid, nprocs) - - STATUS = 0 - -#ifdef WITH_OPENMP - if (myid .eq. 0) then - print *,"Threaded version of test program" - print *,"Using ",omp_get_max_threads()," threads" - print *," " - endif -#endif -#ifdef HAVE_REDIRECT - if (check_redirect_environment_variable()) then - if (myid .eq. 0) then - print *," " - print *,"Redirection of mpi processes is used" - print *," " - if (create_directories() .ne. 1) then - write(error_unit,*) "Unable to create directory for stdout and stderr!" - stop - endif - endif - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) - call redirect_stdout(myid) - endif -#endif - - if (write_to_file) then - if (myid .eq. 0) print *,"Writing output files" - endif - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! 
at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - REAL version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - print *, "This is an example how ELPA2 chooses a default kernel," -#ifdef HAVE_ENVIRONMENT_CHECKING - print *, "or takes the kernel defined in the environment variable," -#endif - print *, "since the ELPA API call does not contain any kernel specification" - print * - print *, " The settings are: ",trim(get_actual_real_kernel_name())," as real kernel" - print * -#ifndef HAVE_ENVIRONMENT_CHECKING - print *, " Notice that it is not possible with this build to set the " - print *, " kernel via an environment variable! To change this re-install" - print *, " the library and have a look at the log files" -#endif - print *, " The qr-decomposition is used via the api call" - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every - ! process has a unique (my_prow,my_pcol) pair). - - call set_up_blacsgrid(mpi_comm_world, my_blacs_ctxt, np_rows, np_cols, & - nprow, npcol, my_prow, my_pcol) - - if (myid==0) then - print '(a)','| Past BLACS_Gridinfo.' - end if - - ! All ELPA routines need MPI communicators for communicating within - ! rows or columns of processes, these are set in get_elpa_communicators. 
- - mpierr = get_elpa_communicators(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - - if (myid==0) then - print '(a)','| Past split communicator setup for rows and columns.' - end if - - call set_up_blacs_descriptor(na ,nblk, my_prow, my_pcol, np_rows, np_cols, & - na_rows, na_cols, sc_desc, my_blacs_ctxt, info) - - if (myid==0) then - print '(a)','| Past scalapack descriptor setup.' - end if - - !------------------------------------------------------------------------------- - ! Allocate matrices and set up a test matrix for the eigenvalue problem -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("set up matrix") -#endif - allocate(a (na_rows,na_cols)) - allocate(z (na_rows,na_cols)) - allocate(as(na_rows,na_cols)) - - allocate(ev(na)) - - call prepare_matrix(na, myid, sc_desc, iseed, a, z, as) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("set up matrix") -#endif - ! set print flag in elpa1 - elpa_print_times = .true. - - !------------------------------------------------------------------------------- - ! Calculate eigenvalues/eigenvectors - - if (myid==0) then - print '(a)','| Entering two-stage ELPA solver ... ' - print * - end if - - - ! ELPA is called without any kernel specification in the API, - ! furthermore, if the environment variable is not set, the - ! default kernel is called. Otherwise, the kernel defined in the - ! environment variable - - call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only - success = solve_evp_real_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & - na_cols, & - mpi_comm_rows, mpi_comm_cols, mpi_comm_world, & - useQR=.true.) - - if (.not.(success)) then - write(error_unit,*) "solve_evp_real_2stage produced an error! Aborting..." - call MPI_ABORT(mpi_comm_world, 1, mpierr) - endif - - if (myid==0) then - print '(a)','| Two-step ELPA solver complete.' 
- print * - end if - - if(myid == 0) print *,'Time transform to tridi :',time_evp_fwd - if(myid == 0) print *,'Time solve tridi :',time_evp_solve - if(myid == 0) print *,'Time transform back EVs :',time_evp_back - if(myid == 0) print *,'Total time (sum above) :',time_evp_back+time_evp_solve+time_evp_fwd - - if(write_to_file) then - if (myid == 0) then - open(17,file="EVs_real2_out.txt",form='formatted',status='new') - do i=1,na - write(17,*) i,ev(i) - enddo - close(17) - endif - endif - !------------------------------------------------------------------------------- - ! Test correctness of result (using plain scalapack routines) - allocate(tmp1(na_rows,na_cols)) - allocate(tmp2(na_rows,na_cols)) - - status = check_correctness(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) - - deallocate(a) - deallocate(as) - - deallocate(z) - deallocate(tmp1) - deallocate(tmp2) - deallocate(ev) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("program") - print *," " - print *,"Timings program:" - call timer%print("program") - print *," " - print *,"End timings program" -#endif - call blacs_gridexit(my_blacs_ctxt) - call mpi_finalize(mpierr) - call EXIT(STATUS) -end - -!------------------------------------------------------------------------------- diff --git a/test/fortran_test_programs/test_real_with_c.F90 b/test/fortran_test_programs/test_real_with_c.F90 deleted file mode 100644 index d936f303c..000000000 --- a/test/fortran_test_programs/test_real_with_c.F90 +++ /dev/null @@ -1,424 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! 
- Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 1 real case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. 
-!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -program test_real - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - REAL version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -!------------------------------------------------------------------------------- - use precision - use ELPA1 - use elpa_utilities, only : error_unit - use from_c -#ifdef WITH_OPENMP - use test_util -#endif - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - -#ifdef HAVE_REDIRECT - use redirect -#endif -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - - implicit none - include 'mpif.h' - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! 
nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: mpi_comm_rows_fromC, mpi_comm_cols_fromC - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol,j - - integer(kind=ik) :: my_prowFromC, my_pcolFromC - integer, external :: numroc - - real(kind=rk), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:) - - real(kind=rk), allocatable :: aFromC(:,:), evFromC(:), zFromC(:,:) - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, & - provided_mpi_thread_level -#endif - logical :: write_to_file - - integer(kind=ik) :: checksWrong, checksWrongRecv - logical :: success - - success = .true. - - call read_input_parameters(na, nev, nblk, write_to_file) - - !------------------------------------------------------------------------------- - ! MPI Initialization - call setup_mpi(myid, nprocs) - - if (write_to_file) then - if (myid .eq. 0) print *,"Writing output files" - endif - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) 
- - - call timer%enable() - - call timer%start("program") -#endif - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - - STATUS = 0 -#ifdef WITH_OPENMP - if (myid .eq. 0) then - print *,"Threaded version of test program" - print *,"Using ",omp_get_max_threads()," threads" - print *," " - endif -#endif - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) - -#ifdef HAVE_REDIRECT - if (check_redirect_environment_variable()) then - if (myid .eq. 0) then - print *," " - print *,"Redirection of mpi processes is used" - print *," " - if (create_directories() .ne. 1) then - write(error_unit,*) "Unable to create directory for stdout and stderr!" - stop - endif - endif - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) - call redirect_stdout(myid) - endif -#endif - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - REAL version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every - ! process has a unique (my_prow,my_pcol) pair). 
- - call set_up_blacsgrid(mpi_comm_world, my_blacs_ctxt, np_rows, np_cols, & - nprow, npcol, my_prow, my_pcol) - - if (myid==0) then - print '(a)','| Past BLACS_Gridinfo.' - end if - - my_prowFromC = my_prow - my_pcolFromC = my_pcol - - ! All ELPA routines need MPI communicators for communicating within - ! rows or columns of processes, these are set in get_elpa_communicators. - - mpierr = get_elpa_communicators(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - - ! call here a c function, which via the c-interface in turn calls the - ! appropiate elpa function - mpierr = call_elpa_get_comm_from_c(mpi_comm_world, my_prowFromC, my_pcolFromC, & - mpi_comm_rows_fromC, mpi_comm_cols_fromC) - - if (myid==0) then - print '(a)','| Past split communicator setup for rows and columns.' - end if - - call set_up_blacs_descriptor(na ,nblk, my_prow, my_pcol, np_rows, np_cols, & - na_rows, na_cols, sc_desc, my_blacs_ctxt, info) - - if (myid==0) then - print '(a)','| Past scalapack descriptor setup.' - end if - - !------------------------------------------------------------------------------- - ! Allocate matrices and set up a test matrix for the eigenvalue problem -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("set up matrix") -#endif - allocate(a (na_rows,na_cols)) - allocate(z (na_rows,na_cols)) - allocate(as(na_rows,na_cols)) - - allocate(ev(na)) - - allocate(aFromC (na_rows,na_cols)) - allocate(zFromC (na_rows,na_cols)) - - allocate(evFromC(na)) - - call prepare_matrix(na, myid, sc_desc, iseed, a, z, as) - - aFromC = a - zFromC = z - evFromC = ev - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("set up matrix") -#endif - - !------------------------------------------------------------------------------- - ! Calculate eigenvalues/eigenvectors - - if (myid==0) then - print '(a)','| Entering one-step ELPA solver ... ' - print * - end if - - call mpi_barrier(mpi_comm_world, mpierr) ! 
for correct timings only - success = solve_evp_real_1stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & - na_cols, mpi_comm_rows, mpi_comm_cols) - - if (.not.(success)) then - write(error_unit,*) "solve_evp_real_1stage produced an error! Aborting..." - call MPI_ABORT(mpi_comm_world, 1, mpierr) - endif - - - if (myid==0) then - print '(a)','| One-step ELPA solver complete.' - print * - end if - - if(myid == 0) print *,'Time tridiag_real :',time_evp_fwd - if(myid == 0) print *,'Time solve_tridi :',time_evp_solve - if(myid == 0) print *,'Time trans_ev_real :',time_evp_back - if(myid == 0) print *,'Total time (sum above):',time_evp_back+time_evp_solve+time_evp_fwd - if(write_to_file) then - if (myid == 0) then - open(17,file="EVs_real_out.txt",form='formatted',status='new') - do i=1,na - write(17,*) i,ev(i) - enddo - close(17) - endif - endif - - ! call the c function - call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only - if (myid==0) then - print *," " - print '(a)','| Testing with C-interface ... ' - print *," " - end if - - success = solve_elpa1_real_call_from_c(na, nev, aFromC, na_rows, evFromC, zFromC, na_rows, nblk, & - na_cols, mpi_comm_rows_fromC, mpi_comm_cols_fromC ) - - if (myid==0) then - print *," " - print '(a)','| C call done... ' - print *," " - end if - ! check whether c results are the same - checksWrong = 0 - do j=1,na_cols - do i=1,na_rows - if (a(i,j) .ne. aFromC(i,j)) then - print *,"results for a from Fortran and C are not the same!" - print *,i,j,a(i,j),aFromC(i,j) - checksWrong = 1 - cycle - endif - if (z(i,j) .ne. zFromC(i,j)) then - print *,"results for z from Fortran and C are not the same!" - print *,i,j,z(i,j),zFromC(i,j) - checksWrong = 1 - endif - - enddo - enddo - - ! reduction - call mpi_allreduce(checksWrong, checksWrongRecv,1,MPI_INTEGER,MPI_SUM,MPI_COMM_WORLD,mpierr) - checksWrong = checksWrongRecv - - if (checksWrong == 0) then - if (myid == 0) then - print *,' Checks for matrix a and z are ok... 
' - endif - endif - - checksWrong = 0 - do i=1,na - if (ev(i) .ne. evFromC(i)) then - print *,"results for EV from Fortran and C are not the same!" - print *,i,ev(i),evFromC(i) - checksWrong = 1 - endif - enddo - - ! reduction - call mpi_allreduce(checksWrong, checksWrongRecv,1,MPI_INTEGER,MPI_SUM,MPI_COMM_WORLD,mpierr) - checksWrong = checksWrongRecv - - if (checksWrong == 0) then - if (myid == 0) then - print *,' Checks for EVs are ok... ' - endif - endif - - !------------------------------------------------------------------------------- - ! Test correctness of result (using plain scalapack routines) - allocate(tmp1(na_rows,na_cols)) - allocate(tmp2(na_rows,na_cols)) - - status = check_correctness(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) - - deallocate(a) - deallocate(as) - - deallocate(z) - deallocate(tmp1) - deallocate(tmp2) - deallocate(ev) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("program") - print *," " - print *,"Timings program:" - print *," " - call timer%print("program") - print *," " - print *,"End timings program" - print *," " - print *,"End timings program" -#endif - call blacs_gridexit(my_blacs_ctxt) - call mpi_finalize(mpierr) - - call EXIT(STATUS) - - -end - -!------------------------------------------------------------------------------- diff --git a/test/shared_sources/blacs_infrastructure.F90 b/test/shared_sources/blacs_infrastructure.F90 deleted file mode 100644 index a8aa1318d..000000000 --- a/test/shared_sources/blacs_infrastructure.F90 +++ /dev/null @@ -1,148 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! 
Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! 
-module mod_blacs_infrastructure - - contains - - subroutine set_up_blacsgrid(mpi_comm_world, my_blacs_ctxt, np_rows, & - np_cols, nprow, npcol, my_prow, my_pcol) - - use precision - - implicit none - integer(kind=ik), intent(in) :: mpi_comm_world - integer(kind=ik), intent(inout) :: my_blacs_ctxt, np_rows, & - np_cols, nprow, npcol, my_prow, my_pcol - - my_blacs_ctxt = mpi_comm_world - call BLACS_Gridinit(my_blacs_ctxt, 'C', np_rows, np_cols) - call BLACS_Gridinfo(my_blacs_ctxt, nprow, npcol, my_prow, my_pcol) - end subroutine - - !c> void set_up_blacsgrid_from_fortran(int mpi_comm_world, int* my_blacs_ctxt, - !c> int *np_rows, int *np_cols, int *nprow, int *npcol, - !c> int *my_prow, int *my_pcol); - subroutine set_up_blacsgrid_wrapper(mpi_comm_world, my_blacs_ctxt, np_rows, & - np_cols, nprow, npcol, my_prow, my_pcol) & - bind(C, name="set_up_blacsgrid_from_fortran") - use iso_c_binding - implicit none - integer(kind=c_int), value :: mpi_comm_world - integer(kind=c_int) :: my_blacs_ctxt, np_rows, & - np_cols, nprow, npcol, my_prow, my_pcol - - call set_up_blacsgrid(mpi_comm_world, my_blacs_ctxt, np_rows, & - np_cols, nprow, npcol, my_prow, my_pcol) - end subroutine - - subroutine set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, & - np_rows, np_cols, na_rows, & - na_cols, sc_desc, my_blacs_ctxt, info) - - use elpa_utilities, only : error_unit - use precision - - implicit none - include "mpif.h" - - integer(kind=ik), intent(inout) :: na, nblk, my_prow, my_pcol, np_rows, & - np_cols, na_rows, na_cols, sc_desc(1:9), & - my_blacs_ctxt, info - - integer, external :: numroc - integer(kind=ik) :: mpierr - - ! determine the neccessary size of the distributed matrices, - ! we use the scalapack tools routine NUMROC - - na_rows = numroc(na, nblk, my_prow, 0, np_rows) - na_cols = numroc(na, nblk, my_pcol, 0, np_cols) - - ! set up the scalapack descriptor for the checks below - ! For ELPA the following restrictions hold: - ! 
- block sizes in both directions must be identical (args 4 a. 5) - ! - first row and column of the distributed matrix must be on - ! row/col 0/0 (arg 6 and 7) - - call descinit(sc_desc, na, na, nblk, nblk, 0, 0, my_blacs_ctxt, na_rows, info) - - if (info .ne. 0) then - write(error_unit,*) 'Error in BLACS descinit! info=',info - write(error_unit,*) 'Most likely this happend since you want to use' - write(error_unit,*) 'more MPI tasks than are possible for your' - write(error_unit,*) 'problem size (matrix size and blocksize)!' - write(error_unit,*) 'The blacsgrid can not be set up properly' - write(error_unit,*) 'Try reducing the number of MPI tasks...' - call MPI_ABORT(mpi_comm_world, 1, mpierr) - endif - - end subroutine - - !c> void set_up_blacs_descriptor_from_fortran(int na, int nblk, int my_prow, int my_pcol, - !c> int np_rows, int np_cols, - !c> int *na_rows, int *na_cols, - !c> int sc_desc[9], - !c> int my_blacs_ctxt, - !c> int *info); - subroutine set_up_blacs_descriptor_wrapper(na, nblk, my_prow, my_pcol, & - np_rows, np_cols, na_rows, & - na_cols, sc_desc, & - my_blacs_ctxt, info) & - bind(C, name="set_up_blacs_descriptor_from_fortran") - - use iso_c_binding - implicit none - - - integer(kind=c_int), value :: na, nblk, my_prow, my_pcol, np_rows, & - np_cols, my_blacs_ctxt - integer(kind=c_int) :: na_rows, na_cols, info, sc_desc(1:9) - - call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, & - np_rows, np_cols, na_rows, & - na_cols, sc_desc, my_blacs_ctxt, info) - - - end subroutine - -end module diff --git a/test/shared_sources/call_elpa1.c b/test/shared_sources/call_elpa1.c deleted file mode 100644 index 5de630467..000000000 --- a/test/shared_sources/call_elpa1.c +++ /dev/null @@ -1,59 +0,0 @@ -/* This file is part of ELPA. 
*/ -/* */ -/* The ELPA library was originally created by the ELPA consortium, */ -/* consisting of the following organizations: */ -/* */ -/* - Max Planck Computing and Data Facility (MPCDF), formerly known as */ -/* Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), */ -/* - Bergische Universität Wuppertal, Lehrstuhl für angewandte */ -/* Informatik, */ -/* - Technische Universität München, Lehrstuhl für Informatik mit */ -/* Schwerpunkt Wissenschaftliches Rechnen , */ -/* - Fritz-Haber-Institut, Berlin, Abt. Theorie, */ -/* - Max-Plack-Institut für Mathematik in den Naturwissenschaften, */ -/* Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, */ -/* and */ -/* - IBM Deutschland GmbH */ -/* */ -/* */ -/* More information can be found here: */ -/* http://elpa.mpcdf.mpg.de/ */ -/* */ -/* ELPA is free software: you can redistribute it and/or modify */ -/* it under the terms of the version 3 of the license of the */ -/* GNU Lesser General Public License as published by the Free */ -/* Software Foundation. */ -/* */ -/* ELPA is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ -/* GNU Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public License */ -/* along with ELPA. If not, see <http://www.gnu.org/licenses/> */ -/* */ -/* ELPA reflects a substantial effort on the part of the original */ -/* ELPA consortium, and we ask you to respect the spirit of the */ -/* license that we chose: i.e., please contribute any changes you */ -/* may have back to the original ELPA library distribution, and keep */ -/* any derivatives of ELPA under the same license that we chose for */ -/* the original distribution, the GNU Lesser General Public License. 
*/ -/* */ -/* */ -#include <string.h> -#include <stdio.h> -#include <stdlib.h> -#include <elpa/elpa.h> -#include <complex.h> - -int call_elpa1_real_solver_from_c(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int ncols, int mpi_comm_rows, int mpi_comm_cols) { - return elpa_solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, ncols, mpi_comm_rows, mpi_comm_cols); -} - -int call_elpa1_complex_solver_from_c(int na, int nev, complex double *a, int lda, double *ev, complex double *q, int ldq, int nblk, int ncols, int mpi_comm_rows, int mpi_comm_cols) { - return elpa_solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, ncols, mpi_comm_rows, mpi_comm_cols); -} - -int call_elpa_get_comm_from_c(int mpi_comm_world, int my_prow, int my_pcol, int *mpi_comm_rows, int *mpi_comm_cols) { - return elpa_get_communicators(mpi_comm_world, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols); -} diff --git a/test/shared_sources/call_elpa2.c b/test/shared_sources/call_elpa2.c deleted file mode 100644 index 66206c0a1..000000000 --- a/test/shared_sources/call_elpa2.c +++ /dev/null @@ -1,68 +0,0 @@ -/* This file is part of ELPA. */ -/* */ -/* The ELPA library was originally created by the ELPA consortium, */ -/* consisting of the following organizations: */ -/* */ -/* - Max Planck Computing and Data Facility (MPCDF), formerly known as */ -/* Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), */ -/* - Bergische Universität Wuppertal, Lehrstuhl für angewandte */ -/* Informatik, */ -/* - Technische Universität München, Lehrstuhl für Informatik mit */ -/* Schwerpunkt Wissenschaftliches Rechnen , */ -/* - Fritz-Haber-Institut, Berlin, Abt. Theorie, */ -/* - Max-Plack-Institut für Mathematik in den Naturwissenschaften, */ -/* Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, */ -/* and */ -/* - IBM Deutschland GmbH */ -/* */ -/* */ -/* More information can be found here: */ -/* http://elpa.mpcdf.mpg.de/ */ -/* */ -/* ELPA is free software: you can redistribute it and/or modify */ -/* it under the terms of the version 3 of the license of the */ -/* GNU Lesser General Public License as published by the Free */ -/* Software Foundation. */ -/* */ -/* ELPA is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ -/* GNU Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public License */ -/* along with ELPA. If not, see <http://www.gnu.org/licenses/> */ -/* */ -/* ELPA reflects a substantial effort on the part of the original */ -/* ELPA consortium, and we ask you to respect the spirit of the */ -/* license that we chose: i.e., please contribute any changes you */ -/* may have back to the original ELPA library distribution, and keep */ -/* any derivatives of ELPA under the same license that we chose for */ -/* the original distribution, the GNU Lesser General Public License. 
*/ -/* */ -/* */ -#include <string.h> -#include <stdio.h> -#include <stdlib.h> -#include <elpa/elpa.h> -#include <complex.h> - -int call_elpa1_real_solver_from_c(int na, int nev, int ncols, double *a, int lda, double *ev, double *q, int ldq, int nblk, int mpi_comm_rows, int mpi_comm_cols) { - int result; - result = elpa_solve_evp_real_1stage(na, nev, ncols, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols); - return result; -} - -int call_elpa1_complex_solver_from_c(int na, int nev, int ncols, complex double *a, int lda, double *ev, complex double *q, int ldq, int nblk, int mpi_comm_rows, int mpi_comm_cols) { - int result; - result = elpa_solve_evp_complex_1stage(na, nev, ncols, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols); - return result; -} - -int call_elpa_get_comm_from_c(int mpi_comm_world, int my_prow, int my_pcol, int *mpi_comm_rows, int *mpi_comm_cols){ - int mpierr; - - mpierr = elpa_get_communicators(mpi_comm_world, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols); - - return mpierr; -} - diff --git a/test/shared_sources/check_correctnes.F90 b/test/shared_sources/check_correctnes.F90 deleted file mode 100644 index 3d94a5057..000000000 --- a/test/shared_sources/check_correctnes.F90 +++ /dev/null @@ -1,251 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! 
http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" - -module mod_check_correctness - - - interface check_correctness - module procedure check_correctness_complex - module procedure check_correctness_real - end interface - - contains - - function check_correctness_complex(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) result(status) - -! use mpi - use precision - implicit none - include 'mpif.h' - integer(kind=ik) :: status - integer(kind=ik), intent(in) :: na, nev, myid - complex(kind=ck), intent(in) :: as(:,:), z(:,:) - complex(kind=ck), intent(inout) :: tmp1(:,:), tmp2(:,:) - real(kind=rk) :: ev(:) - complex(kind=ck) :: xc - integer(kind=ik) :: sc_desc(:), mpierr - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - integer(kind=ik) :: i - real(kind=rk) :: err, errmax - - status = 0 - - ! 1. Residual (maximum of || A*Zi - Zi*EVi ||) - ! tmp1 = A * Z - ! 
as is original stored matrix, Z are the EVs - call pzgemm('N','N',na,nev,na,CONE,as,1,1,sc_desc, & - z,1,1,sc_desc,CZERO,tmp1,1,1,sc_desc) - - ! tmp2 = Zi*EVi - tmp2(:,:) = z(:,:) - do i=1,nev - xc = ev(i) - call pzscal(na,xc,tmp2,1,i,sc_desc,1) - enddo - - ! tmp1 = A*Zi - Zi*EVi - tmp1(:,:) = tmp1(:,:) - tmp2(:,:) - - ! Get maximum norm of columns of tmp1 - errmax = 0.0 - do i=1,nev - xc = 0 - call pzdotc(na,xc,tmp1,1,i,sc_desc,1,tmp1,1,i,sc_desc,1) - errmax = max(errmax, sqrt(real(xc,8))) - enddo - - ! Get maximum error norm over all processors - err = errmax - - call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr) - if (myid==0) print * - if (myid==0) print *,'Error Residual :',errmax - - if (errmax .gt. 5e-12) then - status = 1 - endif - - ! 2. Eigenvector orthogonality - - ! tmp1 = Z**T * Z - tmp1 = 0 - call pzgemm('C','N',nev,nev,na,CONE,z,1,1,sc_desc, & - z,1,1,sc_desc,CZERO,tmp1,1,1,sc_desc) - - ! Initialize tmp2 to unit matrix - tmp2 = 0 - call pzlaset('A',nev,nev,CZERO,CONE,tmp2,1,1,sc_desc) - - ! tmp1 = Z**T * Z - Unit Matrix - tmp1(:,:) = tmp1(:,:) - tmp2(:,:) - - ! Get maximum error (max abs value in tmp1) - err = maxval(abs(tmp1)) - call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr) - if (myid==0) print *,'Error Orthogonality:',errmax - - if (errmax .gt. 5e-12) then - status = 1 - endif - end function - - function check_correctness_real(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) result(status) - -! use mpi - use precision - implicit none - include 'mpif.h' - integer(kind=ik) :: status - integer(kind=ik), intent(in) :: na, nev, myid - real(kind=rk), intent(in) :: as(:,:), z(:,:) - real(kind=rk), intent(inout) :: tmp1(:,:), tmp2(:,:) - real(kind=rk) :: ev(:) - integer(kind=ik) :: sc_desc(:), mpierr - - integer(kind=ik) :: i - real(kind=rk) :: err, errmax - - status = 0 - - ! 1. Residual (maximum of || A*Zi - Zi*EVi ||) - ! 
tmp1 = A * Z - call pdgemm('N','N',na,nev,na,1.d0,as,1,1,sc_desc, & - z,1,1,sc_desc,0.d0,tmp1,1,1,sc_desc) - - ! tmp2 = Zi*EVi - tmp2(:,:) = z(:,:) - do i=1,nev - call pdscal(na,ev(i),tmp2,1,i,sc_desc,1) - enddo - - ! tmp1 = A*Zi - Zi*EVi - tmp1(:,:) = tmp1(:,:) - tmp2(:,:) - - ! Get maximum norm of columns of tmp1 - errmax = 0.0 - do i=1,nev - err = 0.0 - call pdnrm2(na,err,tmp1,1,i,sc_desc,1) - errmax = max(errmax, err) - enddo - - ! Get maximum error norm over all processors - err = errmax - - call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr) - if (myid==0) print * - if (myid==0) print *,'Error Residual :',errmax - - if (errmax .gt. 5e-12) then - status = 1 - endif - - ! 2. Eigenvector orthogonality - - ! tmp1 = Z**T * Z - tmp1 = 0 - call pdgemm('T','N',nev,nev,na,1.d0,z,1,1,sc_desc, & - z,1,1,sc_desc,0.d0,tmp1,1,1,sc_desc) - - ! Initialize tmp2 to unit matrix - tmp2 = 0 - call pdlaset('A',nev,nev,0.d0,1.d0,tmp2,1,1,sc_desc) - - ! tmp1 = Z**T * Z - Unit Matrix - tmp1(:,:) = tmp1(:,:) - tmp2(:,:) - - ! Get maximum error (max abs value in tmp1) - err = maxval(abs(tmp1)) - call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr) - if (myid==0) print *,'Error Orthogonality:',errmax - - if (errmax .gt. 
5e-12) then - status = 1 - endif - end function - - !c> int check_correctness_real_from_fortran(int na, int nev, int na_rows, int na_cols, - !c> double *as, double *z, double *ev, - !c> int sc_desc[9], int myid, - !c> double *tmp1, double *tmp2); - function check_correctness_real_wrapper(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2) result(status) & - bind(C,name="check_correctness_real_from_fortran") - - use iso_c_binding - - implicit none - - integer(kind=c_int) :: status - integer(kind=c_int), value :: na, nev, myid, na_rows, na_cols - real(kind=c_double) :: as(1:na_rows,1:na_cols), z(1:na_rows,1:na_cols) - real(kind=c_double) :: tmp1(1:na_rows,1:na_cols), tmp2(1:na_rows,1:na_cols) - real(kind=c_double) :: ev(1:na) - integer(kind=c_int) :: sc_desc(1:9) - - status = check_correctness_real(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) - - end function - !c> int check_correctness_complex_from_fortran(int na, int nev, int na_rows, int na_cols, - !c> complex double *as, complex double *z, double *ev, - !c> int sc_desc[9], int myid, - !c> complex double *tmp1, complex double *tmp2); - function check_correctness_complex_wrapper(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2) result(status) & - bind(C,name="check_correctness_complex_from_fortran") - - use iso_c_binding - - implicit none - - integer(kind=c_int) :: status - integer(kind=c_int), value :: na, nev, myid, na_rows, na_cols - complex(kind=c_double) :: as(1:na_rows,1:na_cols), z(1:na_rows,1:na_cols) - complex(kind=c_double) :: tmp1(1:na_rows,1:na_cols), tmp2(1:na_rows,1:na_cols) - real(kind=c_double) :: ev(1:na) - integer(kind=c_int) :: sc_desc(1:9) - - status = check_correctness_complex(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) - - end function - -end module mod_check_correctness diff --git a/test/shared_sources/mod_from_c.F90 b/test/shared_sources/mod_from_c.F90 deleted file mode 100644 index 0616ee92c..000000000 --- a/test/shared_sources/mod_from_c.F90 +++ /dev/null 
@@ -1,160 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! 
-module from_c - implicit none - - public - - interface - integer(kind=c_int) function elpa1_real_c(na, nev, a, lda, ev, q, ldq, & - nblk, matrixCols, mpi_comm_rows, mpi_comm_cols ) & - bind(C, name="call_elpa1_real_solver_from_c") - - use iso_c_binding - implicit none - - integer(kind=c_int), value :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=c_double) :: a(1:lda,1:matrixCOls), ev(1:na), q(1:ldq,1:matrixCols) - end function elpa1_real_c - - - end interface - - interface - integer(kind=c_int) function elpa1_complex_c(na, nev, a, lda, ev, q, ldq, & - nblk, matrixCols, mpi_comm_rows, mpi_comm_cols ) & - bind(C, name="call_elpa1_complex_solver_from_c") - - use iso_c_binding - implicit none - - integer(kind=c_int), value :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=c_double) :: ev(1:na) - complex(kind=c_double) :: a(1:lda,1:matrixCOls), q(1:ldq,1:matrixCols) - - end function elpa1_complex_c - - - end interface - - interface - integer(kind=c_int) function elpa_get_comm_c(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) & - bind(C, name="call_elpa_get_comm_from_c") - use iso_c_binding - implicit none - integer(kind=c_int), value :: mpi_comm_world, my_prow, my_pcol - integer(kind=c_int) :: mpi_comm_rows, mpi_comm_cols - - end function - end interface - - contains - - function solve_elpa1_real_call_from_c(na, nev, a, lda, ev, q, ldq, & - nblk, matrixCOls, mpi_comm_rows, mpi_comm_cols ) & - result(success) - use precision - use iso_c_binding - implicit none - - integer(kind=ik) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - logical :: success - integer(kind=ik) :: successC - - real(kind=c_double) :: a(1:lda,1:matrixCols), ev(1:na), q(1:ldq,1:matrixCols) - - successC = elpa1_real_c(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_cols) - - if (successC .eq. 1) then - success = .true. - else - success = .false. 
- endif - - end function - - function solve_elpa1_complex_call_from_c(na, nev, a, lda, ev, q, ldq, & - nblk, matrixCOls, mpi_comm_rows, mpi_comm_cols ) & - result(success) - - use precision - use iso_c_binding - implicit none - - integer(kind=ik) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - logical :: success - integer(kind=ik) :: successC - - real(kind=c_double) :: ev(1:na) - complex(kind=c_double) :: a(1:lda,1:matrixCols), q(1:ldq,1:matrixCols) - - - successC = elpa1_complex_c(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_cols) - - if (successC .eq. 1) then - success = .true. - else - success = .false. - endif - - end function - - - function call_elpa_get_comm_from_c(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) result(mpierr) - - use precision - use iso_c_binding - implicit none - - integer(kind=ik) :: mpierr - integer(kind=ik) :: mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols - - mpierr = elpa_get_comm_c(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - end function -end module from_c diff --git a/test/shared_sources/prepare_matrix.F90 b/test/shared_sources/prepare_matrix.F90 deleted file mode 100644 index e9a441962..000000000 --- a/test/shared_sources/prepare_matrix.F90 +++ /dev/null @@ -1,168 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! 
and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -module mod_prepare_matrix - - interface prepare_matrix - module procedure prepare_matrix_complex - module procedure prepare_matrix_real - end interface - - contains - - subroutine prepare_matrix_complex(na, myid, sc_desc, iseed, xr, a, z, as) - - use precision - implicit none - - integer(kind=ik), intent(in) :: myid, na, sc_desc(:) - integer(kind=ik), intent(inout) :: iseed(:) - real(kind=rk), intent(inout) :: xr(:,:) - complex(kind=ck), intent(inout) :: z(:,:), a(:,:), as(:,:) - - complex(kind=ck), parameter :: CZERO = (0.d0, 0.d0), CONE = (1.d0, 0.d0) - - ! for getting a hermitian test matrix A we get a random matrix Z - ! and calculate A = Z + Z**H - - ! we want different random numbers on every process - ! 
(otherwise A might get rank deficient): - - iseed(:) = myid - call RANDOM_SEED(put=iseed) - call RANDOM_NUMBER(xr) - z(:,:) = xr(:,:) - call RANDOM_NUMBER(xr) - z(:,:) = z(:,:) + (0.d0,1.d0)*xr(:,:) - - a(:,:) = z(:,:) - - if (myid == 0) then - print '(a)','| Random matrix block has been set up. (only processor 0 confirms this step)' - endif - - call pztranc(na, na, CONE, z, 1, 1, sc_desc, CONE, a, 1, 1, sc_desc) ! A = A + Z**H - - if (myid == 0) then - print '(a)','| Random matrix block has been symmetrized' - endif - - ! save original matrix A for later accuracy checks - - as = a - - end subroutine - - subroutine prepare_matrix_real(na, myid, sc_desc, iseed, a, z, as) - - use precision - implicit none - - integer(kind=ik), intent(in) :: myid, na, sc_desc(:) - integer(kind=ik), intent(inout) :: iseed(:) - real(kind=ck), intent(inout) :: z(:,:), a(:,:), as(:,:) - - ! for getting a hermitian test matrix A we get a random matrix Z - ! and calculate A = Z + Z**H - - ! we want different random numbers on every process - ! (otherwise A might get rank deficient): - - iseed(:) = myid - call RANDOM_SEED(put=iseed) - call RANDOM_NUMBER(z) - - a(:,:) = z(:,:) - - if (myid == 0) then - print '(a)','| Random matrix block has been set up. (only processor 0 confirms this step)' - endif - - call pdtran(na, na, 1.d0, z, 1, 1, sc_desc, 1.d0, a, 1, 1, sc_desc) ! A = A + Z**T - - if (myid == 0) then - print '(a)','| Random matrix block has been symmetrized' - endif - - ! 
save original matrix A for later accuracy checks - - as = a - - end subroutine - - !c> void prepare_matrix_real_from_fortran(int na, int myid, int na_rows, int na_cols, - !c> int sc_desc[9], int iseed[4096], - !c> double *a, double *z, double *as); - subroutine prepare_matrix_real_wrapper(na, myid, na_rows, na_cols, sc_desc, iseed, a, z, as) & - bind(C, name="prepare_matrix_real_from_fortran") - use iso_c_binding - - implicit none - - integer(kind=c_int) , value :: myid, na, na_rows, na_cols - integer(kind=c_int) :: sc_desc(1:9) - integer(kind=c_int) :: iseed(1:4096) - real(kind=c_double) :: z(1:na_rows,1:na_cols), a(1:na_rows,1:na_cols), & - as(1:na_rows,1:na_cols) - - call prepare_matrix_real(na, myid, sc_desc, iseed, a, z, as) - end subroutine - !c> void prepare_matrix_complex_from_fortran(int na, int myid, int na_rows, int na_cols, - !c> int sc_desc[9], int iseed[4096], double *xr, - !c> complex double *a, complex double *z, complex double *as); - subroutine prepare_matrix_complex_wrapper(na, myid, na_rows, na_cols, sc_desc, iseed, xr, a, z, as) & - bind(C, name="prepare_matrix_complex_from_fortran") - use iso_c_binding - - implicit none - - integer(kind=c_int) , value :: myid, na, na_rows, na_cols - integer(kind=c_int) :: sc_desc(1:9) - integer(kind=c_int) :: iseed(1:4096) - real(kind=c_double) :: xr(1:na_rows,1:na_cols) - complex(kind=c_double) :: z(1:na_rows,1:na_cols), a(1:na_rows,1:na_cols), & - as(1:na_rows,1:na_cols) - - call prepare_matrix_complex(na, myid, sc_desc, iseed, xr, a, z, as) - end subroutine - -end module diff --git a/test/shared_sources/read_input_parameters.F90 b/test/shared_sources/read_input_parameters.F90 deleted file mode 100644 index dd0307f70..000000000 --- a/test/shared_sources/read_input_parameters.F90 +++ /dev/null @@ -1,103 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! 
- Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -module mod_read_input_parameters - - contains - - subroutine read_input_parameters(na, nev, nblk, write_to_file) - use ELPA_utilities, only : error_unit - use precision - implicit none - include 'mpif.h' - - integer(kind=ik), intent(out) :: na, nev, nblk - logical, intent(out) :: write_to_file - - ! 
Command line arguments - character(len=128) :: arg1, arg2, arg3, arg4 - integer(kind=ik) :: mpierr - - ! default parameters - na = 4000 - nev = 1500 - nblk = 16 - write_to_file = .false. - - if (.not. any(COMMAND_ARGUMENT_COUNT() == [0, 3, 4])) then - write(error_unit, '(a,i0,a)') "Invalid number (", COMMAND_ARGUMENT_COUNT(), ") of command line arguments!" - write(error_unit, *) "Expected: program [ [matrix_size num_eigenvalues block_size] ""output""]" - stop 1 - endif - - if (COMMAND_ARGUMENT_COUNT() == 3) then - call GET_COMMAND_ARGUMENT(1, arg1) - call GET_COMMAND_ARGUMENT(2, arg2) - call GET_COMMAND_ARGUMENT(3, arg3) - - read(arg1, *) na - read(arg2, *) nev - read(arg3, *) nblk - endif - - if (COMMAND_ARGUMENT_COUNT() == 4) then - call GET_COMMAND_ARGUMENT(1, arg1) - call GET_COMMAND_ARGUMENT(2, arg2) - call GET_COMMAND_ARGUMENT(3, arg3) - call GET_COMMAND_ARGUMENT(4, arg4) - read(arg1, *) na - read(arg2, *) nev - read(arg3, *) nblk - - if (arg4 .eq. "output") then - write_to_file = .true. - else - write(error_unit, *) "Invalid value for output flag! Must be ""output"" or omitted" - stop 1 - endif - - endif - - end subroutine - -end module diff --git a/test/shared_sources/redir.c b/test/shared_sources/redir.c deleted file mode 100644 index 66a217c4c..000000000 --- a/test/shared_sources/redir.c +++ /dev/null @@ -1,125 +0,0 @@ -// This file is part of ELPA. -// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -// Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. If not, see <http://www.gnu.org/licenses/> -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. 
-// -// -// -------------------------------------------------------------------------------------------------- -#include <stdio.h> -#include <fcntl.h> -#include <stdlib.h> -#include <unistd.h> -#include <sys/stat.h> -#include <sys/types.h> -#include <unistd.h> -#include <errno.h> - -#define NAME_LENGTH 4096 -#define FILENAME "./mpi_stdout/std%3s_rank%04d.txt" - -FILE *tout, *terr; -void dup_filename(char *filename, int dupfd); -void dup_fd(int fd, int dupfd); - -int _mkdirifnotexists(const char *dir) { - struct stat s; - if (stat(dir, &s) != 0) { - if (errno == ENOENT) { - if (mkdir(dir, 0755) != 0) { - perror("mkdir"); - return 0; - } else { - return 1; - } - } else { - perror("stat()"); - return 0; - } - } else if (!S_ISDIR(s.st_mode)) { - fprintf(stderr, "\"%s\" does exist and is not a directory\n", dir); - return 0; - } else { - return 1; - } -} - -int create_directories(void) { - if (!_mkdirifnotexists("mpi_stdout")) return 0; - return 1; -} - -void redirect_stdout(int *myproc) { - char buf[NAME_LENGTH]; - - if (*myproc == 0) { - snprintf(buf, NAME_LENGTH, "tee " FILENAME, "out", *myproc); - tout = popen(buf, "w"); - dup_fd(fileno(tout), 1); - - snprintf(buf, NAME_LENGTH, "tee " FILENAME, "err", *myproc); - terr = popen(buf, "w"); - dup_fd(fileno(terr), 2); - } else { - snprintf(buf, NAME_LENGTH, FILENAME, "out", *myproc); - dup_filename(buf, 1); - - snprintf(buf, NAME_LENGTH, FILENAME, "err", *myproc); - dup_filename(buf, 2); - } - - return; -} - -/* Redirect file descriptor dupfd to file filename */ -void dup_filename(char *filename, int dupfd) { - int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); - if(fd < 0) { - perror("open()"); - exit(1); - } - dup_fd(fd, dupfd); -} - -/* Redirect file descriptor dupfd to file descriptor fd */ -void dup_fd(int fd, int dupfd) { - if(dup2(fd,dupfd) < 0) { - perror("dup2()"); - exit(1); - } -} diff --git a/test/shared_sources/redirect.F90 b/test/shared_sources/redirect.F90 deleted file mode 100644 index 
d0859d4de..000000000 --- a/test/shared_sources/redirect.F90 +++ /dev/null @@ -1,118 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! 
-#include "config-f90.h" - -module redirect - use, intrinsic :: iso_c_binding - - implicit none - public - - logical :: use_redirect_stdout - - interface - integer(kind=C_INT) function create_directories_c() bind(C, name="create_directories") - use, intrinsic :: iso_c_binding - implicit none - end function - end interface - - interface - subroutine redirect_stdout_c(myproc) bind(C, name="redirect_stdout") - use, intrinsic :: iso_c_binding - implicit none - integer(kind=C_INT), intent(in) :: myproc - end subroutine - end interface - - contains -!> -!> This function is the Fortran driver for the -!> C program to create the redirect output -!> directory -!> -!> \param none -!> \result res integer indicates success or failure - function create_directories() result(res) - use precision - implicit none - integer(kind=ik) :: res - res = int(create_directories_c()) - end function -!> -!> This subroutine is the Fortran driver for the -!> redirection of stdout and stderr of each MPI -!> task -!> -!> \param myproc MPI task id - subroutine redirect_stdout(myproc) - use, intrinsic :: iso_c_binding - use precision - implicit none - integer(kind=ik), intent(in) :: myproc - call redirect_stdout_c(int(myproc, kind=C_INT)) - end subroutine -!> -!> This function checks, whether the environment variable -!> "REDIRECT_ELPA_TEST_OUTPUT" is set to "true". -!> Returns ".true." if variable is set, otherwise ".false." -!> This function only works if the during the build process -!> "HAVE_ENVIRONMENT_CHECKING" was tested successfully -!> -!> \param none -!> \return logical - function check_redirect_environment_variable() result(redirect) - implicit none - logical :: redirect - character(len=255) :: REDIRECT_VARIABLE - - redirect = .false. - -#if defined(HAVE_ENVIRONMENT_CHECKING) - call get_environment_variable("REDIRECT_ELPA_TEST_OUTPUT",REDIRECT_VARIABLE) -#endif - if (trim(REDIRECT_VARIABLE) .eq. "true") redirect = .true. 
- - end function - -end module redirect diff --git a/test/shared_sources/setup_mpi.F90 b/test/shared_sources/setup_mpi.F90 deleted file mode 100644 index cae2e0bce..000000000 --- a/test/shared_sources/setup_mpi.F90 +++ /dev/null @@ -1,86 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! 
the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -module mod_setup_mpi - - contains - - subroutine setup_mpi(myid, nprocs) - use test_util - use ELPA_utilities - use precision - implicit none - include 'mpif.h' - - integer(kind=ik) :: mpierr - - integer(kind=ik), intent(out) :: myid, nprocs -#ifdef WITH_OPENMP - integer(kind=ik) :: required_mpi_thread_level, & - provided_mpi_thread_level - -#endif -#ifndef WITH_OPENMP - call mpi_init(mpierr) -#else - required_mpi_thread_level = MPI_THREAD_MULTIPLE - - call mpi_init_thread(required_mpi_thread_level, & - provided_mpi_thread_level, mpierr) - - if (required_mpi_thread_level .ne. provided_mpi_thread_level) then - write(error_unit,*) "MPI ERROR: MPI_THREAD_MULTIPLE is not provided on this system" - write(error_unit,*) " only ", mpi_thread_level_name(provided_mpi_thread_level), " is available" - call exit(77) - endif - -#endif - call mpi_comm_rank(mpi_comm_world,myid,mpierr) - call mpi_comm_size(mpi_comm_world,nprocs,mpierr) - - - - end subroutine - - -end module mod_setup_mpi diff --git a/test/shared_sources/util.F90 b/test/shared_sources/util.F90 deleted file mode 100644 index d1499e5af..000000000 --- a/test/shared_sources/util.F90 +++ /dev/null @@ -1,76 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! 
More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see <http://www.gnu.org/licenses/> -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! 
-module test_util - implicit none - private - public mpi_thread_level_name - include 'mpif.h' - - contains -!> -!> This function translates, if ELPA was build with OpenMP support, -!> the found evel of "thread safetiness" from the internal number -!> of the MPI library into a human understandable value -!> -!> \param level thread-saftiness of the MPI library -!> \return str human understandable value of thread saftiness - pure function mpi_thread_level_name(level) result(str) - use precision - implicit none - integer(kind=ik), intent(in) :: level - character(len=21) :: str - select case(level) - case (MPI_THREAD_SINGLE) - str = "MPI_THREAD_SINGLE" - case (MPI_THREAD_FUNNELED) - str = "MPI_THREAD_FUNNELED" - case (MPI_THREAD_SERIALIZED) - str = "MPI_THREAD_SERIALIZED" - case (MPI_THREAD_MULTIPLE) - str = "MPI_THREAD_MULTIPLE" - case default - write(str,'(i0,1x,a)') level, "(Unknown level)" - end select - end function - -end module diff --git a/test_project/Makefile.am b/test_project/Makefile.am deleted file mode 100644 index 22a12d948..000000000 --- a/test_project/Makefile.am +++ /dev/null @@ -1,10 +0,0 @@ -## Process this file with automake to produce Makefile.in - -ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4 - -AM_FCFLAGS = @FC_MODINC@modules @FC_MODOUT@modules $(ELPA_FCFLAGS) -AM_LDFLAGS = $(ELPA_LIBS) - -#bindir = $(abs_top_builddir) -bin_PROGRAMS = test_real -test_real_SOURCES = src/test_real.F90 diff --git a/test_project/autogen.sh b/test_project/autogen.sh deleted file mode 120000 index 9f8a4cb7d..000000000 --- a/test_project/autogen.sh +++ /dev/null @@ -1 +0,0 @@ -../autogen.sh \ No newline at end of file diff --git a/test_project/configure.ac b/test_project/configure.ac deleted file mode 100644 index 5e5662a8f..000000000 --- a/test_project/configure.ac +++ /dev/null @@ -1,93 +0,0 @@ -AC_PREREQ([2.69]) -AC_INIT([elpa_test_project],[2014.06.000], elpa-library@rzg.mpg.de) -AC_CONFIG_SRCDIR([src/test_real.F90]) - -AM_INIT_AUTOMAKE([foreign -Wall subdir-objects]) - 
-# Without this, automake tries to be smart and rebuilt -# the autoconf generated files such as configure, aclocal.m4, etc., -# in case the timestamps of files such as configure.ac are newer -# -# This only makes trouble for end users with out-of-date autoconf versions -# that cannot produce these files -AM_MAINTAINER_MODE([disable]) - -AC_CONFIG_MACRO_DIR([m4]) -AC_CONFIG_HEADERS([config.h]) -AM_SILENT_RULES([yes]) - -rm -rf config.h config-f90.h - -AX_CHECK_GNU_MAKE() -if test x$_cv_gnu_make_command = x ; then - AC_MSG_ERROR([Need GNU Make]) -fi - -AC_CHECK_PROG(CPP_FOUND,cpp,yes,no) -if test x"${CPP_FOUND}" = xno; then - AC_MSG_ERROR([no cpp found]) -fi - -# gnu-make fortran module dependencies -m4_include([fdep/fortran_dependencies.m4]) -FDEP_F90_GNU_MAKE_DEPS - -AC_PROG_INSTALL -AM_PROG_CC_C_O -AM_PROG_AR -AM_PROG_AS - -AC_LANG([Fortran]) -m4_include([m4/ax_prog_fc_mpi.m4]) - -dnl check whether an mpi compiler is available; -dnl if not abort since it is mandatory -AX_PROG_FC_MPI([],[have_mpi=yes],[have_mpi=no - if test x"${have_mpi}" = xno; then - AC_MSG_ERROR([no mpi found]) - fi]) - -AC_FC_FREEFORM -AC_FC_MODULE_FLAG -AC_FC_MODULE_OUTPUT_FLAG - -AC_MSG_CHECKING(whether OpenMP usage is specified) -AC_ARG_WITH([openmp], - AS_HELP_STRING([--with-openmp], - [use OpenMP threading, default no.]), - [with_openmp=yes], - [with_openmp=no]) - AC_MSG_RESULT([${with_openmp}]) - if test x"${enable_openmp}" = x"yes"; then - with_openmp=yes - AC_MSG_CHECKING(whether --enable-openmp is specified) - AC_MSG_RESULT([${enable_openmp}]) - fi - AM_CONDITIONAL([WITH_OPENMP],[test x"$with_openmp" = x"yes"]) - if test x"${with_openmp}" = x"yes"; then - AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading]) - AX_ELPA_OPENMP - elpa="elpa_openmp-2014.06.001" - else - elpa="elpa-2014.06.001" - fi - -# Here comes the ELPA specific part -PKG_PROG_PKG_CONFIG -PKG_CHECK_MODULES([ELPA],[${elpa}],[],[AC_MSG_ERROR(["Need ${elpa}"])]) -PKG_CHECK_VAR([ELPA_FCFLAGS],[${elpa}],[fcflags]) - -LT_INIT 
- -AC_SUBST([FC_MODINC]) -AC_SUBST([FC_MODOUT]) - -rm -rf modules/ .fortran_dependencies/ -mkdir modules - -AC_CONFIG_FILES([ - Makefile -]) -AC_OUTPUT - -grep "^#define" config.h > config-f90.h diff --git a/test_project/fdep b/test_project/fdep deleted file mode 120000 index 6c88d7b43..000000000 --- a/test_project/fdep +++ /dev/null @@ -1 +0,0 @@ -../fdep \ No newline at end of file diff --git a/test_project/m4/ax_prog_fc_mpi.m4 b/test_project/m4/ax_prog_fc_mpi.m4 deleted file mode 120000 index cd1756b46..000000000 --- a/test_project/m4/ax_prog_fc_mpi.m4 +++ /dev/null @@ -1 +0,0 @@ -../../m4/ax_prog_fc_mpi.m4 \ No newline at end of file diff --git a/test_project/src/test_real.F90 b/test_project/src/test_real.F90 deleted file mode 120000 index 122153610..000000000 --- a/test_project/src/test_real.F90 +++ /dev/null @@ -1 +0,0 @@ -../../test/test_real.F90 \ No newline at end of file -- GitLab