Commit 1e1812bc authored by Andreas Marek

Single precision kernel for AVX512 real block6

parent 5225392a
...@@ -2192,6 +2192,19 @@ intel-double-precision-mpi-noopenmp-ftimings-redirect-real-avx512_block6-complex ...@@ -2192,6 +2192,19 @@ intel-double-precision-mpi-noopenmp-ftimings-redirect-real-avx512_block6-complex
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1000 500 128' - make check TEST_FLAGS='1000 500 128'
intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx512_block6-complex-avx512_block1-kernel-jobs:
tags:
- KNL
script:
- ./autogen.sh
- ./configure FC=mpiifort CC=mpiicc CFLAGS="-O3 -mtune=knl -axMIC-AVX512" FCFLAGS="-O3 -mtune=knl -axMIC-AVX512" SCALAPACK_FCFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" SCALAPACK_LDFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKLROOT/lib/intel64" --with-real-avx512_block6-kernel-only --with-complex-avx512_block1-kernel-only --enable-single-precision
- /home/elpa/wait_until_midnight.sh
- make -j 8
- export OMP_NUM_THREADS=1
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1000 500 128'
intel-set-kernel-via-environment-variable-mpi-openmp-job: intel-set-kernel-via-environment-variable-mpi-openmp-job:
tags: tags:
- cpu - cpu
......
...@@ -204,9 +204,9 @@ endif ...@@ -204,9 +204,9 @@ endif
if WITH_REAL_AVX512_BLOCK6_KERNEL if WITH_REAL_AVX512_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_single_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_single_precision.c
#endif endif
endif endif
......
...@@ -495,7 +495,7 @@ __forceinline void hh_trafo_kernel_64_AVX512_4hv_single(float* q, float* hh, int ...@@ -495,7 +495,7 @@ __forceinline void hh_trafo_kernel_64_AVX512_4hv_single(float* q, float* hh, int
q3 = _mm512_NFMA_ps(x3, h1, q3); q3 = _mm512_NFMA_ps(x3, h1, q3);
q3 = _mm512_NFMA_ps(y3, h2, q3); q3 = _mm512_NFMA_ps(y3, h2, q3);
q3 = _mm512_NFMA_ps(z3, h3, q3); q3 = _mm512_NFMA_ps(z3, h3, q3);
q3 = _mm512_NFMA_pd(w3, h4, q3); q3 = _mm512_NFMA_ps(w3, h4, q3);
_mm512_store_ps(&q[(i*ldq)+32],q3); _mm512_store_ps(&q[(i*ldq)+32],q3);
q4 = _mm512_load_ps(&q[(i*ldq)+48]); q4 = _mm512_load_ps(&q[(i*ldq)+48]);
...@@ -932,7 +932,7 @@ __forceinline void hh_trafo_kernel_48_AVX512_4hv_single(float* q, float* hh, int ...@@ -932,7 +932,7 @@ __forceinline void hh_trafo_kernel_48_AVX512_4hv_single(float* q, float* hh, int
q3 = _mm512_NFMA_ps(x3, h1, q3); q3 = _mm512_NFMA_ps(x3, h1, q3);
q3 = _mm512_NFMA_ps(y3, h2, q3); q3 = _mm512_NFMA_ps(y3, h2, q3);
q3 = _mm512_NFMA_ps(z3, h3, q3); q3 = _mm512_NFMA_ps(z3, h3, q3);
q3 = _mm512_NFMA_pd(w3, h4, q3); q3 = _mm512_NFMA_ps(w3, h4, q3);
_mm512_store_ps(&q[(i*ldq)+32],q3); _mm512_store_ps(&q[(i*ldq)+32],q3);
// q4 = _mm512_load_ps(&q[(i*ldq)+48]); // q4 = _mm512_load_ps(&q[(i*ldq)+48]);
...@@ -1369,7 +1369,7 @@ __forceinline void hh_trafo_kernel_32_AVX512_4hv_single(float* q, float* hh, int ...@@ -1369,7 +1369,7 @@ __forceinline void hh_trafo_kernel_32_AVX512_4hv_single(float* q, float* hh, int
// q3 = _mm512_NFMA_ps(x3, h1, q3); // q3 = _mm512_NFMA_ps(x3, h1, q3);
// q3 = _mm512_NFMA_ps(y3, h2, q3); // q3 = _mm512_NFMA_ps(y3, h2, q3);
// q3 = _mm512_NFMA_ps(z3, h3, q3); // q3 = _mm512_NFMA_ps(z3, h3, q3);
// q3 = _mm512_NFMA_pd(w3, h4, q3); // q3 = _mm512_NFMA_ps(w3, h4, q3);
// _mm512_store_ps(&q[(i*ldq)+32],q3); // _mm512_store_ps(&q[(i*ldq)+32],q3);
// q4 = _mm512_load_ps(&q[(i*ldq)+48]); // q4 = _mm512_load_ps(&q[(i*ldq)+48]);
...@@ -1806,7 +1806,7 @@ __forceinline void hh_trafo_kernel_16_AVX512_4hv_single(float* q, float* hh, int ...@@ -1806,7 +1806,7 @@ __forceinline void hh_trafo_kernel_16_AVX512_4hv_single(float* q, float* hh, int
// q3 = _mm512_NFMA_ps(x3, h1, q3); // q3 = _mm512_NFMA_ps(x3, h1, q3);
// q3 = _mm512_NFMA_ps(y3, h2, q3); // q3 = _mm512_NFMA_ps(y3, h2, q3);
// q3 = _mm512_NFMA_ps(z3, h3, q3); // q3 = _mm512_NFMA_ps(z3, h3, q3);
// q3 = _mm512_NFMA_pd(w3, h4, q3); // q3 = _mm512_NFMA_ps(w3, h4, q3);
// _mm512_store_ps(&q[(i*ldq)+32],q3); // _mm512_store_ps(&q[(i*ldq)+32],q3);
// q4 = _mm512_load_ps(&q[(i*ldq)+48]); // q4 = _mm512_load_ps(&q[(i*ldq)+48]);
......
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment