Commit c38695bd authored by Andreas Marek

Double precision real block6 kernel for Sparc64

parent 46ed16fd
......@@ -244,12 +244,12 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
#if WITH_REAL_SPARC64_BLOCK4_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c
if WITH_REAL_SPARC64_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_single_precision.c
#endif
#endif
endif
if WITH_REAL_VSX_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
......@@ -288,9 +288,9 @@ endif
if WITH_REAL_SPARC64_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c
endif
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c
#endif
endif
if WITH_REAL_VSX_BLOCK6_KERNEL
......@@ -689,6 +689,9 @@ EXTRA_DIST = \
src/elpa2/kernels/real_vsx_2hv_template.c \
src/elpa2/kernels/real_vsx_4hv_template.c \
src/elpa2/kernels/real_vsx_6hv_template.c \
src/elpa2/kernels/real_sparc64_2hv_template.c \
src/elpa2/kernels/real_sparc64_4hv_template.c \
src/elpa2/kernels/real_sparc64_6hv_template.c \
src/elpa2/kernels/real_sse_2hv_template.c \
src/elpa2/kernels/real_sse_4hv_template.c \
src/elpa2/kernels/real_sse_6hv_template.c \
......
......@@ -470,8 +470,6 @@ m4_define(elpa_m4_sparc64_kernels, [
real_sparc64_block2
real_sparc64_block4
real_sparc64_block6
complex_sparc64_block1
complex_sparc64_block2
])
m4_define(elpa_m4_vsx_kernels, [
......
......@@ -72,9 +72,7 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1, 15, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2, 16, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
......
......@@ -628,32 +628,32 @@
! sparc64 block1 complex kernel
#if defined(WITH_COMPLEX_SPARC64_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL))
ttt = mpi_wtime()
do j = ncols, 1, -1
#ifdef WITH_OPENMP
call single_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) */
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!#ifndef WITH_FIXED_COMPLEX_KERNEL
! if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1) then
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
!#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL))
! ttt = mpi_wtime()
! do j = ncols, 1, -1
!#ifdef WITH_OPENMP
! call single_hh_trafo_&
! &MATH_DATATYPE&
! &_sparc64_1hv_&
! &PRECISION&
! & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#else
! call single_hh_trafo_&
! &MATH_DATATYPE&
! &_sparc64_1hv_&
! &PRECISION&
! & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif
! enddo
!#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) */
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
! endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1)
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SPARC64_BLOCK1_KERNEL */
#endif /* COMPLEXCASE */
......@@ -918,45 +918,45 @@
! implementation of sparc64 block 2 complex case
#if defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
ttt = mpi_wtime()
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_2hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_2hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
#ifdef WITH_OPENMP
if (j==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_1hv_&
&PRECISION&
& (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
if (j==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_sparc64_1hv_&
&PRECISION&
& (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!#ifndef WITH_FIXED_COMPLEX_KERNEL
! if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2) then
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
! ttt = mpi_wtime()
! do j = ncols, 2, -2
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
!#ifdef WITH_OPENMP
! call double_hh_trafo_&
! &MATH_DATATYPE&
! &_sparc64_2hv_&
! &PRECISION&
! & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
!#else
! call double_hh_trafo_&
! &MATH_DATATYPE&
! &_sparc64_2hv_&
! &PRECISION&
! & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
!#endif
! enddo
!#ifdef WITH_OPENMP
! if (j==1) call single_hh_trafo_&
! &MATH_DATATYPE&
! &_sparc64_1hv_&
! &PRECISION&
! & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#else
! if (j==1) call single_hh_trafo_&
! &MATH_DATATYPE&
! &_sparc64_1hv_&
! &PRECISION&
! & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#endif
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
! endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2)
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SPARC64_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */
......
......@@ -192,6 +192,14 @@
write(error_unit,*) "The GENERIC kernel will be used at the moment"
kernel = ELPA_2STAGE_REAL_GENERIC
endif
! Special case: there are currently no single-precision kernels for SPARC64.
! If any of the real SPARC64 block kernels (block2, block4 or block6) is
! selected, write a notice to error_unit and fall back to the GENERIC real
! kernel.
! NOTE(review): presumably this sits inside a single-precision preprocessor
! branch closed by the #endif lines that follow -- confirm against the full file.
if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK6 ) then
write(error_unit,*) "ELPA: At the moment there exist no specific SINGLE precision kernels for SPARC64"
write(error_unit,*) "The GENERIC kernel will be used at the moment"
kernel = ELPA_2STAGE_REAL_GENERIC
endif
#endif
#endif
......
This diff is collapsed.
......@@ -759,7 +759,7 @@ __forceinline void hh_trafo_kernel_12_SPARC64_4hv_single(float* q, float* hh, in
h1 = tau1;
x1 = _SSE_MUL(x1, h1);
x2 = _SSE_MUL(x2, h1);
x3 = _SSE_MUL(x3, h1)
x3 = _SSE_MUL(x3, h1);
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
......@@ -944,7 +944,7 @@ __forceinline void hh_trafo_kernel_12_SPARC64_4hv_single(float* q, float* hh, in
#endif
#endif
#ifdef HAVE_SPARC64_INTRINSICS
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
h3 = _mm_set_pd(hh[(ldh*2)+1], hh[(ldh*2)+1]);
#endif
......@@ -1164,7 +1164,7 @@ __forceinline void hh_trafo_kernel_12_SPARC64_4hv_single(float* q, float* hh, in
q2 = _SSE_SUB(q2, _SSE_MUL(x2, h1));
q3 = _SSE_SUB(q3, _SSE_MUL(x3, h1));
#ifdef HAVE_SSE_INTRINSCS
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
h2 = _mm_set1_pd(hh[ldh+nb-2]);
#endif
......@@ -2248,7 +2248,7 @@ __forceinline void hh_trafo_kernel_4_SPARC64_4hv_single(float* q, float* hh, int
#endif
#endif
#ifdef HAVE_SPARC64
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
h2 = _mm_set_pd(hh[ldh+1], hh[ldh+1]);
h3 = _mm_set_pd(hh[(ldh*2)+2], hh[(ldh*2)+2]);
......
This diff is collapsed.
......@@ -57,7 +57,7 @@ module elpa2_utilities
public
integer(kind=c_int), parameter :: number_of_real_kernels = ELPA_2STAGE_NUMBER_OF_REAL_KERNELS - 6
integer(kind=c_int), parameter :: number_of_complex_kernels = ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS - 4
integer(kind=c_int), parameter :: number_of_complex_kernels = ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS
#ifdef WITH_REAL_GENERIC_KERNEL
integer(kind=c_int), parameter :: REAL_ELPA_KERNEL_GENERIC = ELPA_2STAGE_REAL_GENERIC
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment