Commit a74e2253 authored by Andreas Marek's avatar Andreas Marek
Browse files

Move srcs of multiply_a_b to new folder

parent ae584b5a
......@@ -75,6 +75,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/cholesky/mod_elpa_cholesky.F90 \
src/cholesky/mod_cholesky_cuda.F90 \
src/invert_trm/mod_elpa_invert_trm.F90 \
src/multiply_a_b/mod_elpa_multiply_a_b.F90 \
src/elpa1/mod_distribute_global_column.F90 \
src/elpa1/mod_v_add_s.F90 \
src/elpa1/mod_solve_secular_equation.F90 \
......@@ -122,7 +123,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/general/error_checking.inc \
src/cholesky/elpa_cholesky_template.F90 \
src/invert_trm/elpa_invert_trm_template.F90 \
src/elpa1/elpa_multiply_a_b.F90 \
src/multiply_a_b/elpa_multiply_a_b_template.F90 \
src/elpa1/elpa_solve_tridi_impl_public.F90 \
src/general/elpa_ssr2_template.F90 \
src/general/elpa_ssmv_template.F90 \
......@@ -886,8 +887,9 @@ EXTRA_DIST = \
src/cholesky/mod_cholesky_cuda.F90 \
src/invert_trm/mod_elpa_invert_trm.F90 \
src/invert_trm/mod_invert_trm_cuda.F90 \
src/multiply_a_b/elpa_multiply_a_b_template.F90 \
src/multiply_a_b/mod_elpa_multiply_a_b.F90 \
src/invert_trm/elpa_invert_trm_template.F90 \
src/elpa1/elpa_multiply_a_b.F90 \
src/elpa1/elpa_reduce_add_vectors.F90 \
src/elpa1/elpa_solve_tridi_impl_public.F90 \
src/elpa1/elpa_transpose_vectors.F90 \
......
......@@ -58,179 +58,12 @@ module elpa1_auxiliary_impl
use elpa_utilities
use elpa_cholesky
use elpa_invert_trm
use elpa_multiply_a_b
implicit none
public :: elpa_mult_at_b_real_double_impl !< Multiply double-precision real matrices A**T * B
public :: elpa_mult_ah_b_complex_double_impl !< Multiply double-precision complex matrices A**H * B
public :: elpa_solve_tridi_double_impl !< Solve tridiagonal eigensystem for a double-precision matrix with divide and conquer method
#ifdef WANT_SINGLE_PRECISION_REAL
public :: elpa_mult_at_b_real_single_impl !< Multiply single-precision real matrices A**T * B
public :: elpa_solve_tridi_single_impl !< Solve tridiagonal eigensystem for a single-precision matrix with divide and conquer method
#endif
#ifdef WANT_SINGLE_PRECISION_COMPLEX
public :: elpa_mult_ah_b_complex_single_impl !< Multiply single-precision complex matrices A**H * B
#endif
contains
#define REALCASE 1
#define DOUBLE_PRECISION
#include "../general/precision_macros.h"
!> \brief elpa_mult_at_b_real_double_impl: multiply double-precision real matrices A**T * B
!> \details
!> Declarations and executable statements are not written out here; they are
!> supplied by the file named in the #include directive inside the function,
!> preprocessed with the real-case, double-precision macro settings made just above.
!> NOTE(review): the dummy arguments are declared in that included file, which is
!> not visible here. The single-precision sibling in this file documents
!> uplo_a / uplo_c as triangle selectors for A and C, ncb as the number of columns
!> of B and C, and ldb / ldc as leading dimensions of b / c - presumably the same
!> applies here; confirm against the included file.
!> \result success
function elpa_mult_at_b_real_double_impl(obj, uplo_a, uplo_c, ncb, a, b, ldb, ldbCols, &
c, ldc, ldcCols) result(success)
! Body (declarations and statements) comes from the shared include file.
#include "elpa_multiply_a_b.F90"
end function elpa_mult_at_b_real_double_impl
#undef DOUBLE_PRECISION
#undef REALCASE
#ifdef WANT_SINGLE_PRECISION_REAL
#define REALCASE 1
#define SINGLE_PRECISION
#include "../general/precision_macros.h"
!> \brief elpa_mult_at_b_real_single_impl: Performs C := A**T * B
!>        where  A is a square matrix (obj%na,obj%na) which is optionally upper or lower triangular
!>               B is a (obj%na,ncb) matrix
!>               C is a (obj%na,ncb) matrix where optionally only the upper or lower
!>                 triangle may be computed
!> \details
!> Declarations and executable statements are supplied by the file named in the
!> #include directive inside the function, preprocessed with the real-case,
!> single-precision macro settings made just above this header.
!>
!> \param  obj            NOTE(review): declared in the included file, not visible here;
!>                        the obj%na / obj%local_nrows notation used in this header suggests
!>                        it is the ELPA object that carries the matrix setup - confirm
!> \param  uplo_a         'U' if A is upper triangular
!>                        'L' if A is lower triangular
!>                        anything else if A is a full matrix
!>                        Please note: This pertains to the original A (as set in the calling program)
!>                                     whereas the transpose of A is used for calculations
!>                        If uplo_a is 'U' or 'L', the other triangle is not used at all,
!>                        i.e. it may contain arbitrary numbers
!> \param  uplo_c         'U' if only the upper diagonal part of C is needed
!>                        'L' if only the lower diagonal part of C is needed
!>                        anything else if the full matrix C is needed
!>                        Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
!>                                     written to a certain extent, i.e. one should not rely on the content there!
!> \param  ncb            Number of columns of B and C
!> \param  a              matrix a
!> \param  b              matrix b
!> \param  ldb            leading dimension of matrix b
!> \param  ldbCols        columns of matrix b (as documented for the complex variants in this file)
!> \param  c              matrix c
!> \param  ldc            leading dimension of matrix c
!> \param  ldcCols        presumably columns of matrix c, by analogy with ldbCols - TODO confirm
!>
!> The quantities below are described for completeness but are not dummy arguments of
!> this function (compare the argument list); NOTE(review): presumably they are read
!> from obj inside the included file - confirm.
!>         na              Number of rows/columns of A, number of rows of B and C
!>         obj%local_nrows leading dimension of matrix a, set with class method obj%set("local_nrows",value)
!>         nblk            blocksize of cyclic distribution, must be the same in both directions!
!>         mpi_comm_rows   MPI communicator for rows
!>         mpi_comm_cols   MPI communicator for columns
!> \result success
function elpa_mult_at_b_real_single_impl(obj, uplo_a, uplo_c, ncb, a, b, ldb, ldbCols, &
c, ldc, ldcCols) result(success)
! Body (declarations and statements) comes from the shared include file.
#include "elpa_multiply_a_b.F90"
end function elpa_mult_at_b_real_single_impl
#undef SINGLE_PRECISION
#undef REALCASE
#endif /* WANT_SINGLE_PRECISION_REAL */
#define COMPLEXCASE 1
#define DOUBLE_PRECISION
#include "../general/precision_macros.h"
!> \brief elpa_mult_ah_b_complex_double_impl: Performs C := A**H * B
!>        where  A is a square matrix (obj%na,obj%na) which is optionally upper or lower triangular
!>               B is a (obj%na,ncb) matrix
!>               C is a (obj%na,ncb) matrix where optionally only the upper or lower
!>                 triangle may be computed
!> \details
!> Declarations and executable statements are supplied by the file named in the
!> #include directive inside the function, preprocessed with the complex-case,
!> double-precision macro settings made just above this header.
!>
!> \param  obj            NOTE(review): declared in the included file, not visible here;
!>                        the obj%na / obj%set(...) notation used in this header suggests
!>                        it is the ELPA object that carries the matrix setup - confirm
!> \param  uplo_a         'U' if A is upper triangular
!>                        'L' if A is lower triangular
!>                        anything else if A is a full matrix
!>                        Please note: This pertains to the original A (as set in the calling program)
!>                                     whereas the conjugate transpose of A (A**H) is used for calculations
!>                        If uplo_a is 'U' or 'L', the other triangle is not used at all,
!>                        i.e. it may contain arbitrary numbers
!> \param  uplo_c         'U' if only the upper diagonal part of C is needed
!>                        'L' if only the lower diagonal part of C is needed
!>                        anything else if the full matrix C is needed
!>                        Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
!>                                     written to a certain extent, i.e. one should not rely on the content there!
!> \param  ncb            Number of columns of B and C
!> \param  a              matrix a
!> \param  b              matrix b
!> \param  ldb            leading dimension of matrix b
!> \param  ldbCols        columns of matrix b
!> \param  c              matrix c
!> \param  ldc            leading dimension of matrix c
!> \param  ldcCols        presumably columns of matrix c, by analogy with ldbCols - TODO confirm
!>
!> The quantities below are described for completeness but are not dummy arguments of
!> this function (compare the argument list); NOTE(review): presumably they are read
!> from obj inside the included file - confirm.
!>         na              Number of rows/columns of A, number of rows of B and C
!>         obj%local_nrows leading dimension of matrix a, set with class method obj%set("local_nrows",value)
!>         ldaCols         columns of matrix a
!>         nblk            blocksize of cyclic distribution, must be the same in both directions!
!>         mpi_comm_rows   MPI communicator for rows
!>         mpi_comm_cols   MPI communicator for columns
!> \result success
function elpa_mult_ah_b_complex_double_impl(obj, uplo_a, uplo_c, ncb, a, b, ldb, ldbCols, &
c, ldc, ldcCols) result(success)
! Body (declarations and statements) comes from the shared include file.
#include "elpa_multiply_a_b.F90"
end function elpa_mult_ah_b_complex_double_impl
#undef DOUBLE_PRECISION
#undef COMPLEXCASE
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#define COMPLEXCASE 1
#define SINGLE_PRECISION
#include "../general/precision_macros.h"
!> \brief elpa_mult_ah_b_complex_single_impl: Performs C := A**H * B
!>        where  A is a square matrix (obj%na,obj%na) which is optionally upper or lower triangular
!>               B is a (obj%na,ncb) matrix
!>               C is a (obj%na,ncb) matrix where optionally only the upper or lower
!>                 triangle may be computed
!> \details
!> Declarations and executable statements are supplied by the file named in the
!> #include directive inside the function, preprocessed with the complex-case,
!> single-precision macro settings made just above this header.
!>
!> \param  obj            NOTE(review): declared in the included file, not visible here;
!>                        the obj%na notation used in this header suggests it is the
!>                        ELPA object that carries the matrix setup - confirm
!> \param  uplo_a         'U' if A is upper triangular
!>                        'L' if A is lower triangular
!>                        anything else if A is a full matrix
!>                        Please note: This pertains to the original A (as set in the calling program)
!>                                     whereas the conjugate transpose of A (A**H) is used for calculations
!>                        If uplo_a is 'U' or 'L', the other triangle is not used at all,
!>                        i.e. it may contain arbitrary numbers
!> \param  uplo_c         'U' if only the upper diagonal part of C is needed
!>                        'L' if only the lower diagonal part of C is needed
!>                        anything else if the full matrix C is needed
!>                        Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
!>                                     written to a certain extent, i.e. one should not rely on the content there!
!> \param  ncb            Number of columns of B and C
!> \param  a              matrix a
!> \param  b              matrix b
!> \param  ldb            leading dimension of matrix b
!> \param  ldbCols        columns of matrix b
!> \param  c              matrix c
!> \param  ldc            leading dimension of matrix c
!> \param  ldcCols        presumably columns of matrix c, by analogy with ldbCols - TODO confirm
!>
!> The quantities below are described for completeness but are not dummy arguments of
!> this function (compare the argument list); NOTE(review): presumably they are read
!> from obj inside the included file - confirm.
!>         na              Number of rows/columns of A, number of rows of B and C
!>         lda             leading dimension of matrix a
!>         ldaCols         columns of matrix a
!>         nblk            blocksize of cyclic distribution, must be the same in both directions!
!>         mpi_comm_rows   MPI communicator for rows
!>         mpi_comm_cols   MPI communicator for columns
!> \result success
function elpa_mult_ah_b_complex_single_impl(obj, uplo_a, uplo_c, ncb, a, b, ldb, ldbCols, &
c, ldc, ldcCols) result(success)
! Body (declarations and statements) comes from the shared include file.
#include "elpa_multiply_a_b.F90"
end function elpa_mult_ah_b_complex_single_impl
#undef SINGLE_PRECISION
#undef COMPLEXCASE
#endif /* WANT_SINGLE_PRECISION_COMPLEX */
#define REALCASE 1
#define DOUBLE_PRECISION
#include "../general/precision_macros.h"
......
......@@ -371,7 +371,7 @@ subroutine trans_ev_&
enddo
#ifdef WITH_MPI
if (nb>0) then
if (nb > 0) then
if (useNonBlockingCollectivesCols) then
call obj%timer%start("mpi_nbc_communication")
call mpi_ibcast(hvb, int(nb,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION , int(cur_pcol,kind=MPI_KIND), &
......@@ -422,7 +422,7 @@ subroutine trans_ev_&
nc = nc+n
enddo
#ifdef WITH_MPI
if (nc>0) then
if (nc > 0) then
if (useNonBlockingCollectivesRows) then
call obj%timer%start("mpi_nbc_communication")
call mpi_iallreduce( h1, h2, int(nc,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, MPI_SUM, &
......@@ -487,7 +487,7 @@ subroutine trans_ev_&
! Q = Q - V * T * V**T * Q
if (l_rows>0) then
if (l_rows > 0) then
if (useGPU) then
if (useIntelGPU) then
call obj%timer%start("mkl_offload")
......@@ -605,7 +605,7 @@ subroutine trans_ev_&
! tmp2 = tmp1
#endif /* WITH_MPI */
if (l_rows>0) then
if (l_rows > 0) then
if (useGPU) then
if (useIntelGPU) then
#ifdef WITH_MPI
......@@ -764,3 +764,4 @@ end subroutine trans_ev_&
&MATH_DATATYPE&
&_&
&PRECISION
......@@ -766,7 +766,7 @@ subroutine tridiag_&
ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
endif
endif
endif
endif ! .not. useGPU
if (wantDebug) call obj%timer%stop("blas")
endif
n_iter = n_iter+1
......@@ -823,7 +823,7 @@ subroutine tridiag_&
if (wantDebug) call obj%timer%stop("mkl_offload")
else
else ! useIntelGPU
! Unlike for CPU, we (for each MPI thread) do just one large mat-vec multiplication
! this requires altering of the algorithm when later explicitly updating the matrix
! after max_stored_uv is reached : we need to update all tiles, not only those above diagonal
......@@ -843,52 +843,52 @@ subroutine tridiag_&
! size_of_datatype, 1)
! endif
if (wantDebug) call obj%timer%stop("gpublas")
endif
endif ! useIntelGPU
else ! mat_vec_as_one_block
!perform multiplication by stripes - it is faster than by blocks, since we call cublas with
!larger matrices. In general, however, this algorithm is very similar to the one with CPU
do i=0,(istep-2)/tile_size
l_col_beg = i*l_cols_per_tile+1
l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
if(l_col_end<l_col_beg) cycle
l_col_beg = i*l_cols_per_tile+1
l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
if (l_col_end<l_col_beg) cycle
l_row_beg = 1
l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
l_row_beg = 1
l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
if (useIntelGPU) then
if (wantDebug) call obj%timer%start("mkl_offload")
if (useIntelGPU) then
if (wantDebug) call obj%timer%start("mkl_offload")
#if 0
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, &
ONE, u_col(l_col_beg:max_local_cols), 1_BLAS_KIND)
#endif
#ifdef WITH_INTEL_GPU_VERSION
call mkl_offload_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
call mkl_offload_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg:matrixRows,l_col_beg:matrixCols), int(matrixRows,kind=BLAS_KIND), &
v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, &
ONE, u_col(l_col_beg:max_local_cols), 1_BLAS_KIND)
#endif
if (wantDebug) call obj%timer%stop("mkl_offload")
if (wantDebug) call obj%timer%stop("mkl_offload")
else
a_offset = ((l_row_beg-1) + (l_col_beg - 1) * matrixRows) * &
else ! useIntelGPU
a_offset = ((l_row_beg-1) + (l_col_beg - 1) * matrixRows) * &
size_of_datatype
call gpublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
call gpublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_dev + a_offset, matrixRows, &
v_row_dev + (l_row_beg - 1) * size_of_datatype, 1, &
ONE, u_col_dev + (l_col_beg - 1) * size_of_datatype, 1)
endif
endif ! useIntelGPU
enddo
do i=0,(istep-2)/tile_size
l_col_beg = i*l_cols_per_tile+1
l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
if(l_col_end<l_col_beg) cycle
if (l_col_end<l_col_beg) cycle
l_row_beg = 1
l_row_end = min(l_rows,i*l_rows_per_tile)
......@@ -930,7 +930,7 @@ subroutine tridiag_&
if (wantDebug) call obj%timer%stop("mkl_offload")
else
else ! useIntelGPU
a_offset = ((l_row_beg-1) + (l_col_beg - 1) * matrixRows) * &
size_of_datatype
if (isSkewsymmetric) then
......@@ -944,7 +944,7 @@ subroutine tridiag_&
v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
endif
endif
endif ! useIntelGPU
enddo
end if !multiplication as one block / per stripes
......
......@@ -62,6 +62,10 @@
use elpa_gpu
use mod_check_for_gpu
use elpa_blas_interfaces
use ELPA_utilities, only : local_index, check_deallocate_f, check_dealloc_gpu_f, &
check_host_dealloc_gpu_f, check_alloc_gpu_f, check_host_alloc_gpu_f, &
check_host_unregister_gpu_f, check_memcpy_gpu_f, check_allocate_f, &
check_host_register_gpu_f, check_alloc
implicit none
#include "../../src/general/precision_kinds.F90"
......
! Copyright 2021, A. Marek
!
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
#include "config-f90.h"
module elpa_multiply_a_b
use, intrinsic :: iso_c_binding
use precision
implicit none
public
public :: elpa_mult_at_b_real_double_impl !< Multiply double-precision real matrices A**T * B
public :: elpa_mult_ah_b_complex_double_impl !< Multiply double-precision complex matrices A**H * B
#ifdef WANT_SINGLE_PRECISION_REAL
public :: elpa_mult_at_b_real_single_impl !< Multiply single-precision real matrices A**T * B
#endif
#ifdef WANT_SINGLE_PRECISION_COMPLEX
public :: elpa_mult_ah_b_complex_single_impl !< Multiply single-precision complex matrices A**H * B
#endif
contains
#define REALCASE 1
#define DOUBLE_PRECISION
#include "../general/precision_macros.h"
!> \brief elpa_mult_at_b_real_double_impl: Performs C := A**T * B
!>        where  A is a square matrix (obj%na,obj%na) which is optionally upper or lower triangular
!>               B is a (obj%na,ncb) matrix
!>               C is a (obj%na,ncb) matrix where optionally only the upper or lower
!>                 triangle may be computed
!> \details
!> The declarations and the executable statements of this function are not written
!> out here: they are supplied by the template file named in the #include directive
!> inside the function body, preprocessed with the real-case, double-precision macro
!> settings made just above this header.
!>
!> \param  obj            NOTE(review): declared in the included template, not visible here;
!>                        the obj%na / obj%local_nrows notation used in this header suggests
!>                        it is the ELPA object that carries the matrix setup - confirm
!> \param  uplo_a         'U' if A is upper triangular
!>                        'L' if A is lower triangular
!>                        anything else if A is a full matrix
!>                        Please note: This pertains to the original A (as set in the calling program)
!>                                     whereas the transpose of A is used for calculations
!>                        If uplo_a is 'U' or 'L', the other triangle is not used at all,
!>                        i.e. it may contain arbitrary numbers
!> \param  uplo_c         'U' if only the upper diagonal part of C is needed
!>                        'L' if only the lower diagonal part of C is needed
!>                        anything else if the full matrix C is needed
!>                        Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
!>                                     written to a certain extent, i.e. one should not rely on the content there!
!> \param  ncb            Number of columns of B and C
!> \param  a              matrix a
!> \param  b              matrix b
!> \param  ldb            leading dimension of matrix b
!> \param  ldbCols        columns of matrix b (as documented for the complex variants in this file)
!> \param  c              matrix c
!> \param  ldc            leading dimension of matrix c
!> \param  ldcCols        presumably columns of matrix c, by analogy with ldbCols - TODO confirm
!>
!> The quantities below are described for completeness but are not dummy arguments of
!> this function (compare the argument list); NOTE(review): presumably they are read
!> from obj inside the template - confirm.
!>         na              Number of rows/columns of A, number of rows of B and C
!>         obj%local_nrows leading dimension of matrix a, set with class method obj%set("local_nrows",value)
!>         nblk            blocksize of cyclic distribution, must be the same in both directions!
!>         mpi_comm_rows   MPI communicator for rows
!>         mpi_comm_cols   MPI communicator for columns
!> \result success
function elpa_mult_at_b_real_double_impl(obj, uplo_a, uplo_c, ncb, a, b, ldb, ldbCols, &
c, ldc, ldcCols) result(success)
! Body (declarations and statements) comes from the shared template.
#include "elpa_multiply_a_b_template.F90"
end function elpa_mult_at_b_real_double_impl
#undef DOUBLE_PRECISION
#undef REALCASE
#ifdef WANT_SINGLE_PRECISION_REAL
#define REALCASE 1
#define SINGLE_PRECISION
#include "../general/precision_macros.h"
!> \brief elpa_mult_at_b_real_single_impl: Performs C := A**T * B
!>        where  A is a square matrix (obj%na,obj%na) which is optionally upper or lower triangular
!>               B is a (obj%na,ncb) matrix
!>               C is a (obj%na,ncb) matrix where optionally only the upper or lower
!>                 triangle may be computed
!> \details
!> The declarations and the executable statements of this function are not written
!> out here: they are supplied by the template file named in the #include directive
!> inside the function body, preprocessed with the real-case, single-precision macro
!> settings made just above this header. The whole function is only compiled when
!> single-precision real support is requested (see the enclosing #ifdef).
!>
!> \param  obj            NOTE(review): declared in the included template, not visible here;
!>                        the obj%na / obj%local_nrows notation used in this header suggests
!>                        it is the ELPA object that carries the matrix setup - confirm
!> \param  uplo_a         'U' if A is upper triangular
!>                        'L' if A is lower triangular
!>                        anything else if A is a full matrix
!>                        Please note: This pertains to the original A (as set in the calling program)
!>                                     whereas the transpose of A is used for calculations
!>                        If uplo_a is 'U' or 'L', the other triangle is not used at all,
!>                        i.e. it may contain arbitrary numbers
!> \param  uplo_c         'U' if only the upper diagonal part of C is needed
!>                        'L' if only the lower diagonal part of C is needed
!>                        anything else if the full matrix C is needed
!>                        Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
!>                                     written to a certain extent, i.e. one should not rely on the content there!
!> \param  ncb            Number of columns of B and C
!> \param  a              matrix a
!> \param  b              matrix b
!> \param  ldb            leading dimension of matrix b
!> \param  ldbCols        columns of matrix b (as documented for the complex variants in this file)
!> \param  c              matrix c
!> \param  ldc            leading dimension of matrix c
!> \param  ldcCols        presumably columns of matrix c, by analogy with ldbCols - TODO confirm
!>
!> The quantities below are described for completeness but are not dummy arguments of
!> this function (compare the argument list); NOTE(review): presumably they are read
!> from obj inside the template - confirm.
!>         na              Number of rows/columns of A, number of rows of B and C
!>         obj%local_nrows leading dimension of matrix a, set with class method obj%set("local_nrows",value)
!>         nblk            blocksize of cyclic distribution, must be the same in both directions!
!>         mpi_comm_rows   MPI communicator for rows
!>         mpi_comm_cols   MPI communicator for columns
!> \result success
function elpa_mult_at_b_real_single_impl(obj, uplo_a, uplo_c, ncb, a, b, ldb, ldbCols, &
c, ldc, ldcCols) result(success)
! Body (declarations and statements) comes from the shared template.
#include "elpa_multiply_a_b_template.F90"
end function elpa_mult_at_b_real_single_impl
#undef SINGLE_PRECISION
#undef REALCASE
#endif /* WANT_SINGLE_PRECISION_REAL */
#define COMPLEXCASE 1
#define DOUBLE_PRECISION
#include "../general/precision_macros.h"
!> \brief elpa_mult_ah_b_complex_double_impl: Performs C := A**H * B
!>        where  A is a square matrix (obj%na,obj%na) which is optionally upper or lower triangular
!>               B is a (obj%na,ncb) matrix
!>               C is a (obj%na,ncb) matrix where optionally only the upper or lower
!>                 triangle may be computed
!> \details
!> The declarations and the executable statements of this function are not written
!> out here: they are supplied by the template file named in the #include directive
!> inside the function body, preprocessed with the complex-case, double-precision
!> macro settings made just above this header.
!>
!> \param  obj            NOTE(review): declared in the included template, not visible here;
!>                        the obj%na / obj%set(...) notation used in this header suggests
!>                        it is the ELPA object that carries the matrix setup - confirm
!> \param  uplo_a         'U' if A is upper triangular
!>                        'L' if A is lower triangular
!>                        anything else if A is a full matrix
!>                        Please note: This pertains to the original A (as set in the calling program)
!>                                     whereas the conjugate transpose of A (A**H) is used for calculations
!>                        If uplo_a is 'U' or 'L', the other triangle is not used at all,
!>                        i.e. it may contain arbitrary numbers
!> \param  uplo_c         'U' if only the upper diagonal part of C is needed
!>                        'L' if only the lower diagonal part of C is needed
!>                        anything else if the full matrix C is needed
!>                        Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
!>                                     written to a certain extent, i.e. one should not rely on the content there!
!> \param  ncb            Number of columns of B and C
!> \param  a              matrix a
!> \param  b              matrix b
!> \param  ldb            leading dimension of matrix b
!> \param  ldbCols        columns of matrix b
!> \param  c              matrix c
!> \param  ldc            leading dimension of matrix c
!> \param  ldcCols        presumably columns of matrix c, by analogy with ldbCols - TODO confirm
!>
!> The quantities below are described for completeness but are not dummy arguments of
!> this function (compare the argument list); NOTE(review): presumably they are read
!> from obj inside the template - confirm.
!>         na              Number of rows/columns of A, number of rows of B and C
!>         obj%local_nrows leading dimension of matrix a, set with class method obj%set("local_nrows",value)
!>         ldaCols         columns of matrix a
!>         nblk            blocksize of cyclic distribution, must be the same in both directions!
!>         mpi_comm_rows   MPI communicator for rows
!>         mpi_comm_cols   MPI communicator for columns
!> \result success
function elpa_mult_ah_b_complex_double_impl(obj, uplo_a, uplo_c, ncb, a, b, ldb, ldbCols, &
c, ldc, ldcCols) result(success)
! Body (declarations and statements) comes from the shared template.
#include "elpa_multiply_a_b_template.F90"
end function elpa_mult_ah_b_complex_double_impl
#undef DOUBLE_PRECISION
#undef COMPLEXCASE
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#define COMPLEXCASE 1
#define SINGLE_PRECISION
#include "../general/precision_macros.h"
!> \brief elpa_mult_ah_b_complex_single_impl: Performs C := A**H * B
!>        where  A is a square matrix (obj%na,obj%na) which is optionally upper or lower triangular
!>               B is a (obj%na,ncb) matrix
!>               C is a (obj%na,ncb) matrix where optionally only the upper or lower
!>                 triangle may be computed
!> \details
!> The declarations and the executable statements of this function are not written
!> out here: they are supplied by the template file named in the #include directive
!> inside the function body, preprocessed with the complex-case, single-precision
!> macro settings made just above this header. The whole function is only compiled
!> when single-precision complex support is requested (see the enclosing #ifdef).
!>
!> \param  obj            NOTE(review): declared in the included template, not visible here;
!>                        the obj%na / obj%set(...) notation used in this header suggests
!>                        it is the ELPA object that carries the matrix setup - confirm
!> \param  uplo_a         'U' if A is upper triangular
!>                        'L' if A is lower triangular
!>                        anything else if A is a full matrix
!>                        Please note: This pertains to the original A (as set in the calling program)
!>                                     whereas the conjugate transpose of A (A**H) is used for calculations
!>                        If uplo_a is 'U' or 'L', the other triangle is not used at all,
!>                        i.e. it may contain arbitrary numbers
!> \param  uplo_c         'U' if only the upper diagonal part of C is needed
!>                        'L' if only the lower diagonal part of C is needed
!>                        anything else if the full matrix C is needed
!>                        Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
!>                                     written to a certain extent, i.e. one should not rely on the content there!
!> \param  ncb            Number of columns of B and C
!> \param  a              matrix a
!> \param  b              matrix b
!> \param  ldb            leading dimension of matrix b
!> \param  ldbCols        columns of matrix b
!> \param  c              matrix c
!> \param  ldc            leading dimension of matrix c
!> \param  ldcCols        presumably columns of matrix c, by analogy with ldbCols - TODO confirm
!>
!> The quantities below are described for completeness but are not dummy arguments of
!> this function (compare the argument list); NOTE(review): presumably they are read
!> from obj inside the template - confirm.
!>         na              Number of rows/columns of A, number of rows of B and C
!>         obj%local_nrows leading dimension of matrix a, set with class method obj%set("local_nrows",value)
!>         ldaCols         columns of matrix a
!>         nblk            blocksize of cyclic distribution, must be the same in both directions!
!>         mpi_comm_rows   MPI communicator for rows
!>         mpi_comm_cols   MPI communicator for columns
!> \result success
function elpa_mult_ah_b_complex_single_impl(obj, uplo_a, uplo_c, ncb, a, b, ldb, ldbCols, &
c, ldc, ldcCols) result(success)
! Body (declarations and statements) comes from the shared template.
#include "elpa_multiply_a_b_template.F90"
end function elpa_mult_ah_b_complex_single_impl
#undef SINGLE_PRECISION
#undef COMPLEXCASE
#endif /* WANT_SINGLE_PRECISION_COMPLEX */