Unverified Commit 70d37e0a authored by Andreas Marek's avatar Andreas Marek
Browse files

Update of configure and documentation for new AVX2 features

parent 44244df1
...@@ -108,6 +108,9 @@ The configure installation is best done in four steps ...@@ -108,6 +108,9 @@ The configure installation is best done in four steps
if the hardware does. If you already included "-mavx" in the if the hardware does. If you already included "-mavx" in the
flags, you can skip "-march=native". flags, you can skip "-march=native".
If you want to use the newer AVX2 instructions, assuming they are supported by
your hardware, please set CFLAGS="-mavx2 -mfma" and CXXFLAGS="-mavx2 -mfma".
Setting the optimization flags for the AVX kernels can be a hassle. If AVX Setting the optimization flags for the AVX kernels can be a hassle. If AVX
kernels are built for your system, you can set the configure option kernels are built for your system, you can set the configure option
...@@ -119,6 +122,9 @@ The configure installation is best done in four steps ...@@ -119,6 +122,9 @@ The configure installation is best done in four steps
./configure CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O2" ./configure CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O2"
An installation with AVX2 and best optimizations could thus look like this:
./configure CFLAGS="-O2 -mavx2 -mfma" CXXFLAGS="-O2 -mavx2 -mfma" FCFLAGS="-O2" --with-avx-optimization
1.4 Installation location 1.4 Installation location
...@@ -161,6 +167,10 @@ The configure installation is best done in four steps ...@@ -161,6 +167,10 @@ The configure installation is best done in four steps
new feature. With the same thought in mind, a binary "elpa2_print_kernels" new feature. With the same thought in mind, a binary "elpa2_print_kernels"
is provided, which is rather self-explanatory. is provided, which is rather self-explanatory.
Some of the above-mentioned tests are also provided as C source files.
These demonstrate how to call ELPA from a C program (i.e. which headers to include
and how to call the ELPA functions). They are NOT intended as a copy-and-paste solution!
4) run "make install" 4) run "make install"
......
...@@ -231,6 +231,43 @@ if test "${can_compile_avx}" = "yes" ; then ...@@ -231,6 +231,43 @@ if test "${can_compile_avx}" = "yes" ; then
fi fi
fi fi
# Probe whether the C compiler, with the flags currently in effect, accepts a
# small program that uses 256-bit intrinsics from x86intrin.h. The result is
# stored in can_compile_avx2 as "yes" or "no". The probe is only compiled,
# not executed, so the uninitialised pointer q is harmless here.
# NOTE(review): the probe calls a fused multiply-add intrinsic, so it
# presumably needs FMA support enabled as well (e.g. -mfma) -- confirm.
AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
double* q;
__m256d q1 = _mm256_load_pd(q);
__m256d y1 = _mm256_fmadd_pd(q1, q1, q1);
return 0;
}
])],
[can_compile_avx2=yes],
[can_compile_avx2=no]
)
AC_MSG_RESULT([${can_compile_avx2}])
# Repeat the same probe with the C++ compiler, but only when the C probe
# succeeded. The C++ outcome overwrites can_compile_avx2.
if test "${can_compile_avx2}" = "yes" ; then
AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C++])
AC_LANG_PUSH([C++])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
double* q;
__m256d q1 = _mm256_load_pd(q);
__m256d y1 = _mm256_fmadd_pd(q1, q1, q1);
return 0;
}
])],
[can_compile_avx2=yes],
[can_compile_avx2=no]
)
AC_LANG_POP([C++])
AC_MSG_RESULT([${can_compile_avx2}])
# A failing C++ probe only emits a warning at this point; configure continues.
if test "${can_compile_avx2}" = "no" ; then
AC_MSG_WARN([Cannot compile C++ with AVX2!])
fi
fi
if test "${can_compile_avx}" = "yes" ; then if test "${can_compile_avx}" = "yes" ; then
install_real_avx_block2=yes install_real_avx_block2=yes
install_real_avx_block4=yes install_real_avx_block4=yes
...@@ -687,4 +724,9 @@ if test "${can_compile_avx}" = "no" ; then ...@@ -687,4 +724,9 @@ if test "${can_compile_avx}" = "no" ; then
AC_MSG_WARN([Could not compile AVX instructions]) AC_MSG_WARN([Could not compile AVX instructions])
fi fi
fi fi
# Report a failed AVX2 compile probe, but only when want_avx is "yes".
# NOTE(review): the guard tests want_avx rather than an AVX2-specific
# switch -- confirm this is intended.
if test "${can_compile_avx2}" = "no" ; then
if test x"${want_avx}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX2 instructions])
fi
fi
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
! - IBM Deutschland GmbH ! - IBM Deutschland GmbH
! !
! This particular source code file contains additions, changes and ! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of ! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium. ! the ELPA consortium.
! !
! More information can be found here: ! More information can be found here:
...@@ -1125,7 +1125,7 @@ end subroutine mult_at_b_real ...@@ -1125,7 +1125,7 @@ end subroutine mult_at_b_real
#define BYTESIZE 16 #define BYTESIZE 16
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#include "elpa_transpose_vectors.X90" #include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90" #include "elpa_reduce_add_vectors.X90"
#undef DATATYPE #undef DATATYPE
#undef BYTESIZE #undef BYTESIZE
#undef COMPLEXCASE #undef COMPLEXCASE
......
...@@ -937,7 +937,7 @@ subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_r ...@@ -937,7 +937,7 @@ subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_r
lre = min(l_rows,(i+1)*l_rows_tile) lre = min(l_rows,(i+1)*l_rows_tile)
call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,dim=1), & call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,dim=1), &
vmr,ubound(vmr,dim=1),1.d0,umc(lcs,dim=1),ubound(umc,dim=1)) vmr,ubound(vmr,dim=1),1.d0,umc(lcs,1),ubound(umc,dim=1))
if (i==0) cycle if (i==0) cycle
lre = min(l_rows,i*l_rows_tile) lre = min(l_rows,i*l_rows_tile)
...@@ -955,9 +955,9 @@ subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_r ...@@ -955,9 +955,9 @@ subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_r
! global tile size is smaller than the global remaining matrix ! global tile size is smaller than the global remaining matrix
! Or if we used the Algorithm 4 ! Or if we used the Algorithm 4
if (tile_size < istep*nbw .or. n_way > 1) then if (tile_size < istep*nbw .or. n_way > 1) then
call elpa_reduce_add_vectors (vmr(1,n_cols+1),ubound(vmr,dim=1),mpi_comm_rows, & call elpa_reduce_add_vectors_real (vmr(1,n_cols+1),ubound(vmr,dim=1),mpi_comm_rows, &
umc, ubound(umc,dim=11), mpi_comm_cols, & umc, ubound(umc,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk) istep*nbw, n_cols, nblk)
endif endif
if (l_cols>0) then if (l_cols>0) then
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment