Commit 7329e8f8 authored by Lorenz Huedepohl

Make separate test programs for single kernel and loop

parent 06493879
......@@ -483,7 +483,8 @@ m4_define(elpa_m4_bgq_kernels, [
])
m4_define(elpa_m4_gpu_kernels, [
gpu
real_gpu
complex_gpu
])
m4_define(elpa_m4_kernel_types, [generic sse sse_assembly avx avx2 avx512 bgp bgq gpu])
......@@ -573,7 +574,8 @@ if test x"$with_gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no
])
use_gpu=yes
use_real_gpu=yes
use_complex_gpu=yes
fi
......@@ -851,9 +853,10 @@ m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
])
AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$use_gpu" = x"yes"])
if test x"${use_gpu}" = x"yes" ; then
AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes"])
if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_DEFINE([WITH_GPU_VERSION],[1],[enable GPU support])
AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be build])
ELPA_2STAGE_COMPLEX_GPU_COMPILED=1
ELPA_2STAGE_REAL_GPU_COMPILED=1
else
......
......@@ -22,25 +22,25 @@ enum ELPA_SOLVERS {
#define ELPA_NUMBER_OF_SOLVERS (0 ELPA_FOR_ALL_SOLVERS(ELPA_ENUM_SUM))
/* Kernel constants */
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
X(ELPA_2STAGE_REAL_GENERIC, 1, @ELPA_2STAGE_REAL_GENERIC_COMPILED@) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE, 2, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_COMPILED@) \
X(ELPA_2STAGE_REAL_BGP, 3, @ELPA_2STAGE_REAL_BGP_COMPILED@) \
X(ELPA_2STAGE_REAL_BGQ, 4, @ELPA_2STAGE_REAL_BGQ_COMPILED@) \
X(ELPA_2STAGE_REAL_SSE_ASSEMBLY, 5, @ELPA_2STAGE_REAL_SSE_ASSEMBLY_COMPILED@) \
X(ELPA_2STAGE_REAL_SSE_BLOCK2, 6, @ELPA_2STAGE_REAL_SSE_BLOCK2_COMPILED@) \
X(ELPA_2STAGE_REAL_SSE_BLOCK4, 7, @ELPA_2STAGE_REAL_SSE_BLOCK4_COMPILED@) \
X(ELPA_2STAGE_REAL_SSE_BLOCK6, 8, @ELPA_2STAGE_REAL_SSE_BLOCK6_COMPILED@) \
X(ELPA_2STAGE_REAL_AVX_BLOCK2, 9, @ELPA_2STAGE_REAL_AVX_BLOCK2_COMPILED@) \
X(ELPA_2STAGE_REAL_AVX_BLOCK4, 10, @ELPA_2STAGE_REAL_AVX_BLOCK4_COMPILED@) \
X(ELPA_2STAGE_REAL_AVX_BLOCK6, 11, @ELPA_2STAGE_REAL_AVX_BLOCK6_COMPILED@) \
X(ELPA_2STAGE_REAL_AVX2_BLOCK2, 12, @ELPA_2STAGE_REAL_AVX2_BLOCK2_COMPILED@) \
X(ELPA_2STAGE_REAL_AVX2_BLOCK4, 13, @ELPA_2STAGE_REAL_AVX2_BLOCK4_COMPILED@) \
X(ELPA_2STAGE_REAL_AVX2_BLOCK6, 14, @ELPA_2STAGE_REAL_AVX2_BLOCK6_COMPILED@) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK2, 15, @ELPA_2STAGE_REAL_AVX512_BLOCK2_COMPILED@) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK4, 16, @ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED@) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK6, 17, @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@) \
X(ELPA_2STAGE_REAL_GPU, 18, @ELPA_2STAGE_REAL_GPU_COMPILED@)
/*
 * X-macro list of all 2-stage real kernels.
 *
 * Invokes X once per kernel as
 *     X(name, numeric id, compiled flag, extra arguments...)
 * where the extra arguments are whatever the caller passed after X and are
 * forwarded unchanged through __VA_ARGS__.
 *
 * The @..._COMPILED@ tokens are placeholders; presumably they are replaced
 * at configure time with 0 or 1 (TODO confirm against configure.ac).
 *
 * Note: when invoked with X only (no extra arguments), __VA_ARGS__ is empty
 * and X is still called with a trailing, empty fourth argument, so X must
 * accept it.
 */
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X, ...) \
X(ELPA_2STAGE_REAL_GENERIC, 1, @ELPA_2STAGE_REAL_GENERIC_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE, 2, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_BGP, 3, @ELPA_2STAGE_REAL_BGP_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_BGQ, 4, @ELPA_2STAGE_REAL_BGQ_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SSE_ASSEMBLY, 5, @ELPA_2STAGE_REAL_SSE_ASSEMBLY_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SSE_BLOCK2, 6, @ELPA_2STAGE_REAL_SSE_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SSE_BLOCK4, 7, @ELPA_2STAGE_REAL_SSE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SSE_BLOCK6, 8, @ELPA_2STAGE_REAL_SSE_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX_BLOCK2, 9, @ELPA_2STAGE_REAL_AVX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX_BLOCK4, 10, @ELPA_2STAGE_REAL_AVX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX_BLOCK6, 11, @ELPA_2STAGE_REAL_AVX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX2_BLOCK2, 12, @ELPA_2STAGE_REAL_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX2_BLOCK4, 13, @ELPA_2STAGE_REAL_AVX2_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX2_BLOCK6, 14, @ELPA_2STAGE_REAL_AVX2_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK2, 15, @ELPA_2STAGE_REAL_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK4, 16, @ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK6, 17, @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GPU, 18, @ELPA_2STAGE_REAL_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......@@ -52,21 +52,21 @@ enum ELPA_REAL_KERNELS {
};
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
X(ELPA_2STAGE_COMPLEX_GENERIC, 1, @ELPA_2STAGE_COMPLEX_GENERIC_COMPILED@) \
X(ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE, 2, @ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE_COMPILED@) \
X(ELPA_2STAGE_COMPLEX_BGP, 3, @ELPA_2STAGE_COMPLEX_BGP_COMPILED@) \
X(ELPA_2STAGE_COMPLEX_BGQ, 4, @ELPA_2STAGE_COMPLEX_BGQ_COMPILED@) \
X(ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY, 5, @ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY_COMPILED@) \
X(ELPA_2STAGE_COMPLEX_SSE_BLOCK1, 6, @ELPA_2STAGE_COMPLEX_SSE_BLOCK1_COMPILED@) \
X(ELPA_2STAGE_COMPLEX_SSE_BLOCK2, 7, @ELPA_2STAGE_COMPLEX_SSE_BLOCK2_COMPILED@) \
X(ELPA_2STAGE_COMPLEX_AVX_BLOCK1, 8, @ELPA_2STAGE_COMPLEX_AVX_BLOCK1_COMPILED@) \
X(ELPA_2STAGE_COMPLEX_AVX_BLOCK2, 9, @ELPA_2STAGE_COMPLEX_AVX_BLOCK2_COMPILED@) \
X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK1, 10, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK1_COMPILED@) \
X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@) \
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@)
/*
 * X-macro list of all 2-stage complex kernels.
 *
 * Invokes X once per kernel as
 *     X(name, numeric id, compiled flag, extra arguments...)
 * where the extra arguments are whatever the caller passed after X and are
 * forwarded unchanged through __VA_ARGS__.
 *
 * The @..._COMPILED@ tokens are placeholders; presumably they are replaced
 * at configure time with 0 or 1 (TODO confirm against configure.ac).
 *
 * Note: when invoked with X only (no extra arguments), __VA_ARGS__ is empty
 * and X is still called with a trailing, empty fourth argument, so X must
 * accept it.
 */
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X, ...) \
X(ELPA_2STAGE_COMPLEX_GENERIC, 1, @ELPA_2STAGE_COMPLEX_GENERIC_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE, 2, @ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_BGP, 3, @ELPA_2STAGE_COMPLEX_BGP_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_BGQ, 4, @ELPA_2STAGE_COMPLEX_BGQ_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY, 5, @ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SSE_BLOCK1, 6, @ELPA_2STAGE_COMPLEX_SSE_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SSE_BLOCK2, 7, @ELPA_2STAGE_COMPLEX_SSE_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX_BLOCK1, 8, @ELPA_2STAGE_COMPLEX_AVX_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX_BLOCK2, 9, @ELPA_2STAGE_COMPLEX_AVX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK1, 10, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
......
......@@ -23,35 +23,38 @@ for g, p, d, s in product(sorted(gpu_flag.keys()),
sorted(prec_flag.keys()),
sorted(domain_flag.keys()),
sorted(solver_flag.keys())):
endifs = 0
extra_flags = []
if (g == 1):
print("if WITH_GPU_VERSION")
endifs += 1
if s == "2stage":
extra_flags.append("-DTEST_KERNEL=ELPA_2STAGE_{0}_GPU".format(d.upper()))
elif s == "2stage":
extra_flags.append("-DTEST_ALL_KERNELS")
if (p == "single"):
if (d == "real"):
print("if WANT_SINGLE_PRECISION_REAL")
elif (d == "complex"):
print("if WANT_SINGLE_PRECISION_COMPLEX")
else:
raise Exception("Oh no!")
endifs += 1
for kernel in ["all_kernels", "default_kernel"] if s == "2stage" else ["nokernel"]:
endifs = 0
extra_flags = []
if (g == 1):
print("if WITH_GPU_VERSION")
endifs += 1
name = "test_{0}_{1}_{2}{3}".format(d, p, s, "_gpu" if g else "")
print("noinst_PROGRAMS += " + name)
print("check_SCRIPTS += " + name + ".sh")
print(name + "_SOURCES = test/Fortran/test.F90")
print(name + "_LDADD = $(build_lib)")
print(name + "_FCFLAGS = $(AM_FCFLAGS) $(FC_MODINC)test_modules $(FC_MODINC)modules \\")
print(" " + " \\\n ".join([
domain_flag[d],
prec_flag[p],
solver_flag[s],
gpu_flag[g]] + extra_flags))
if kernel == "default_kernel":
extra_flags.append("-DTEST_KERNEL=ELPA_2STAGE_{0}_DEFAULT".format(d.upper()))
elif kernel == "all_kernels":
extra_flags.append("-DTEST_ALL_KERNELS")
print("endif\n" * endifs)
if (p == "single"):
if (d == "real"):
print("if WANT_SINGLE_PRECISION_REAL")
elif (d == "complex"):
print("if WANT_SINGLE_PRECISION_COMPLEX")
else:
raise Exception("Oh no!")
endifs += 1
name = "test_{0}_{1}_{2}{3}{4}".format(d, p, s, "" if kernel == "nokernel" else "_" + kernel, "_gpu" if g else "")
print("noinst_PROGRAMS += " + name)
print("check_SCRIPTS += " + name + ".sh")
print(name + "_SOURCES = test/Fortran/test.F90")
print(name + "_LDADD = $(build_lib)")
print(name + "_FCFLAGS = $(AM_FCFLAGS) $(FC_MODINC)test_modules $(FC_MODINC)modules \\")
print(" " + " \\\n ".join([
domain_flag[d],
prec_flag[p],
solver_flag[s],
gpu_flag[g]] + extra_flags))
print("endif\n" * endifs)
......@@ -493,9 +493,9 @@ static int enumerate_identity(int i) {
case value: \
return 1;
#define VALID_CASE_3(name, value, available) \
#define VALID_CASE_3(name, value, available, other_checks) \
case value: \
return available;
return available && (other_checks(value));
static const char* elpa_solver_name(int solver) {
switch(solver) {
......@@ -562,9 +562,13 @@ static const char *real_kernel_name(int kernel) {
}
}
/*
 * Extra validity check for a 2-stage real kernel number.
 *
 * Evaluates to `gpu_is_active` when kernel_number is ELPA_2STAGE_REAL_GPU
 * and to 1 for every other kernel number. `gpu_is_active` is not a macro
 * parameter: a variable of that name must be in scope where the macro is
 * expanded.
 *
 * Both the argument and the whole expansion are parenthesized so that the
 * macro keeps its meaning when the argument is a compound expression or
 * when the result is combined with other operators (e.g. `a && MACRO(k)`).
 */
#define REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
        ((kernel_number) == ELPA_2STAGE_REAL_GPU ? gpu_is_active : 1)
static int real_kernel_is_valid(elpa_index_t index, int n, int new_value) {
int gpu_is_active = elpa_index_get_int_value(index, "gpu", NULL);
switch(new_value) {
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(VALID_CASE_3)
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(VALID_CASE_3, REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
default:
return 0;
}
......@@ -593,9 +597,13 @@ static const char *complex_kernel_name(int kernel) {
}
}
/*
 * Extra validity check for a 2-stage complex kernel number.
 *
 * Evaluates to `gpu_is_active` when kernel_number is ELPA_2STAGE_COMPLEX_GPU
 * and to 1 for every other kernel number. `gpu_is_active` is not a macro
 * parameter: a variable of that name must be in scope where the macro is
 * expanded.
 *
 * Both the argument and the whole expansion are parenthesized so that the
 * macro keeps its meaning when the argument is a compound expression or
 * when the result is combined with other operators (e.g. `a && MACRO(k)`).
 */
#define COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
        ((kernel_number) == ELPA_2STAGE_COMPLEX_GPU ? gpu_is_active : 1)
static int complex_kernel_is_valid(elpa_index_t index, int n, int new_value) {
int gpu_is_active = elpa_index_get_int_value(index, "gpu", NULL);
switch(new_value) {
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(VALID_CASE_3)
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(VALID_CASE_3, COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
default:
return 0;
}
......
......@@ -210,13 +210,6 @@ program test
#ifdef TEST_ALL_KERNELS
do i = 0, elpa_option_cardinality(KERNEL_KEY)
kernel = elpa_option_enumerate(KERNEL_KEY, i)
#ifdef TEST_REAL
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
#else
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
#endif
call e%set("gpu",1)
endif
#endif /* TEST_ALL_KERNELS */
#ifdef TEST_KERNEL
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment