Update of configure and documentation for new AVX2 features

parent 44244df1
......@@ -108,6 +108,9 @@ The configure installation is best done in four steps
if the hardware does. If you already included "-mavx" in the
flags, you can skip "-march=native".
If you want to use the newer AVX2 instructions, assuming they are supported on
your hardware, please set CFLAGS="-mavx2 -mfma" and CXXFLAGS="-mavx2 -mfma".
Setting the optimization flags for the AVX kernels can be a hassle. If AVX
kernels are built for your system, you can set the configure option
......@@ -119,6 +122,9 @@ The configure installation is best done in four steps
./configure CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O2"
An installation with AVX2 and best-optimizations could thus look like this:
./configure CFLAGS="-O2 -mavx2 -mfma" CXXFLAGS="-O2 -mavx2 -mfma" FCFLAGS="-O2" --with-avx-optimization
1.4 Installation location
......@@ -161,6 +167,10 @@ The configure installation is best done in four steps
new feature. With the same thought in mind, a binary "elpa2_print_kernels"
is provided, which is rather self-explanatory.
Also, some of the above-mentioned tests are provided as C source files.
These should demonstrate how to call ELPA from a C program (i.e. which headers to include
and how to call the ELPA functions). They are NOT intended as a copy-and-paste solution!
4) run "make install"
......
......@@ -231,6 +231,43 @@ if test "${can_compile_avx}" = "yes" ; then
fi
fi
dnl Probe whether the C compiler accepts a small program that includes
dnl x86intrin.h and uses the 256-bit intrinsics _mm256_load_pd and
dnl _mm256_fmadd_pd. The outcome is stored in can_compile_avx2 (yes/no).
dnl The program is only compiled, never run, so the uninitialized pointer q
dnl is harmless here.
dnl NOTE(review): _mm256_fmadd_pd is an FMA intrinsic, so this probe presumably
dnl only succeeds when the compiler flags enable FMA as well (e.g. -mfma) - confirm.
AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
double* q;
__m256d q1 = _mm256_load_pd(q);
__m256d y1 = _mm256_fmadd_pd(q1, q1, q1);
return 0;
}
])],
[can_compile_avx2=yes],
[can_compile_avx2=no]
)
AC_MSG_RESULT([${can_compile_avx2}])
dnl Only when the C probe succeeded, repeat the same probe with the C++ compiler.
dnl The C++ result overwrites can_compile_avx2, so a C++ failure turns the
dnl overall result into "no" even though the C probe passed.
if test "${can_compile_avx2}" = "yes" ; then
AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C++])
dnl Switch the current autoconf language to C++ for the probe; restored below.
AC_LANG_PUSH([C++])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
double* q;
__m256d q1 = _mm256_load_pd(q);
__m256d y1 = _mm256_fmadd_pd(q1, q1, q1);
return 0;
}
])],
[can_compile_avx2=yes],
[can_compile_avx2=no]
)
AC_LANG_POP([C++])
AC_MSG_RESULT([${can_compile_avx2}])
dnl A C++ failure is reported as a warning only; configure continues.
if test "${can_compile_avx2}" = "no" ; then
AC_MSG_WARN([Cannot compile C++ with AVX2!])
fi
fi
if test "${can_compile_avx}" = "yes" ; then
install_real_avx_block2=yes
install_real_avx_block4=yes
......@@ -687,4 +724,9 @@ if test "${can_compile_avx}" = "no" ; then
AC_MSG_WARN([Could not compile AVX instructions])
fi
fi
dnl Emit a warning when the AVX2 compile probe failed, but only if
dnl want_avx is set to yes.
if test "${can_compile_avx2}" = "no" && test x"${want_avx}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX2 instructions])
fi
......@@ -15,7 +15,7 @@
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
......@@ -1125,7 +1125,7 @@ end subroutine mult_at_b_real
#define BYTESIZE 16
#define COMPLEXCASE 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
......
......@@ -937,7 +937,7 @@ subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_r
lre = min(l_rows,(i+1)*l_rows_tile)
call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,dim=1), &
vmr,ubound(vmr,dim=1),1.d0,umc(lcs,dim=1),ubound(umc,dim=1))
vmr,ubound(vmr,dim=1),1.d0,umc(lcs,1),ubound(umc,dim=1))
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
......@@ -955,9 +955,9 @@ subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_r
! global tile size is smaller than the global remaining matrix
! Or if we used the Algorithm 4
if (tile_size < istep*nbw .or. n_way > 1) then
call elpa_reduce_add_vectors (vmr(1,n_cols+1),ubound(vmr,dim=1),mpi_comm_rows, &
umc, ubound(umc,dim=11), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
call elpa_reduce_add_vectors_real (vmr(1,n_cols+1),ubound(vmr,dim=1),mpi_comm_rows, &
umc, ubound(umc,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
endif
if (l_cols>0) then
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment