Commit 5225392a authored by Andreas Marek

Single precision kernel for AVX512 real block4

parent 6a210b6a
...@@ -2167,6 +2167,19 @@ intel-double-precision-mpi-noopenmp-ftimings-redirect-real-avx512_block4-complex ...@@ -2167,6 +2167,19 @@ intel-double-precision-mpi-noopenmp-ftimings-redirect-real-avx512_block4-complex
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1000 500 128' - make check TEST_FLAGS='1000 500 128'
intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx512_block4-complex-avx512_block2-kernel-jobs:
tags:
- KNL
script:
- ./autogen.sh
- ./configure FC=mpiifort CC=mpiicc CFLAGS="-O3 -mtune=knl -axMIC-AVX512" FCFLAGS="-O3 -mtune=knl -axMIC-AVX512" SCALAPACK_FCFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" SCALAPACK_LDFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKLROOT/lib/intel64" --with-real-avx512_block4-kernel-only --with-complex-avx512_block2-kernel-only --enable-single-precision
- /home/elpa/wait_until_midnight.sh
- make -j 8
- export OMP_NUM_THREADS=1
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1000 500 128'
intel-double-precision-mpi-noopenmp-ftimings-redirect-real-avx512_block6-complex-avx512_block1-kernel-jobs: intel-double-precision-mpi-noopenmp-ftimings-redirect-real-avx512_block6-complex-avx512_block1-kernel-jobs:
tags: tags:
- KNL - KNL
......
...@@ -174,9 +174,9 @@ endif ...@@ -174,9 +174,9 @@ endif
if WITH_REAL_AVX512_BLOCK4_KERNEL if WITH_REAL_AVX512_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_4hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_4hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_4hv_single_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_4hv_single_precision.c
#endif endif
endif endif
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -1241,52 +1241,52 @@ module compute_hh_trafo_real ...@@ -1241,52 +1241,52 @@ module compute_hh_trafo_real
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK4_KERNEL || WITH_REAL_AVX2_BLOCK4_KERNEL */ #endif /* WITH_REAL_AVX_BLOCK4_KERNEL || WITH_REAL_AVX2_BLOCK4_KERNEL */
!#if defined(WITH_REAL_AVX512_BLOCK4_KERNEL) #if defined(WITH_REAL_AVX512_BLOCK4_KERNEL)
!#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
! if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK4) then if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK4) then
!#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
!
!#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL)) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL))
! ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
! do j = ncols, 4, -4 do j = ncols, 4, -4
! w(:,1) = bcast_buffer(1:nbw,j+off) w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1) w(:,2) = bcast_buffer(1:nbw,j+off-1)
! w(:,3) = bcast_buffer(1:nbw,j+off-2) w(:,3) = bcast_buffer(1:nbw,j+off-2)
! w(:,4) = bcast_buffer(1:nbw,j+off-3) w(:,4) = bcast_buffer(1:nbw,j+off-3)
!#ifdef WITH_OPENMP #ifdef WITH_OPENMP
! call quad_hh_trafo_real_avx512_4hv_single(c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, & call quad_hh_trafo_real_avx512_4hv_single(c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, &
! nbw, nl, stripe_width, nbw) nbw, nl, stripe_width, nbw)
!#else #else
! call quad_hh_trafo_real_avx512_4hv_single(c_loc(a(1,j+off+a_off-3,istripe)), w, & call quad_hh_trafo_real_avx512_4hv_single(c_loc(a(1,j+off+a_off-3,istripe)), w, &
! nbw, nl, stripe_width, nbw) nbw, nl, stripe_width, nbw)
!#endif #endif
! enddo enddo
! do jj = j, 2, -2 do jj = j, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jj+off) w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1) w(:,2) = bcast_buffer(1:nbw,jj+off-1)
!#ifdef WITH_OPENMP #ifdef WITH_OPENMP
! call double_hh_trafo_real_avx512_2hv_single(c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), & call double_hh_trafo_real_avx512_2hv_single(c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), &
! w, nbw, nl, stripe_width, nbw) w, nbw, nl, stripe_width, nbw)
!#else #else
! call double_hh_trafo_real_avx512_2hv_single(c_loc(a(1,jj+off+a_off-1,istripe)), & call double_hh_trafo_real_avx512_2hv_single(c_loc(a(1,jj+off+a_off-1,istripe)), &
! w, nbw, nl, stripe_width, nbw) w, nbw, nl, stripe_width, nbw)
!#endif #endif
! enddo enddo
!#ifdef WITH_OPENMP #ifdef WITH_OPENMP
! if (jj==1) call single_hh_trafo_real_cpu_openmp_single(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, & if (jj==1) call single_hh_trafo_real_cpu_openmp_single(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, &
! istripe,my_thread), & istripe,my_thread), &
! bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
!#else #else
! if (jj==1) call single_hh_trafo_real_cpu_single(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & if (jj==1) call single_hh_trafo_real_cpu_single(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
! bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
!#endif #endif
!
!#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL) ) */ #endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL) ) */
!
!#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
! endif endif
!#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
!#endif /* WITH_REAL_AVX512_BLOCK4_KERNEL */ #endif /* WITH_REAL_AVX512_BLOCK4_KERNEL */
#if defined(WITH_REAL_SSE_BLOCK6_KERNEL) #if defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment