diff --git a/COMPILE b/COMPILE index 8a5f3cd3c04629b80586a01b4c0fa1af9c823130..5b1c5b2c81c05d8acbfa73b9328637e97b3351d4 100644 --- a/COMPILE +++ b/COMPILE @@ -28,14 +28,14 @@ to the C99 standard, you should still be able to compile libsharp with Runtime CPU selection with gcc ------------------------------ -When using a recent gcc (6.0 and newer) on an x86_64 platform, the build -machinery will compile the time-critical functions for several different -architectures (SSE2, AVX, AVX2, FMA3, FMA4, AVX512F), and the appropriate -implementation will be selected at runtime. -This only happens if you do _not_ explicitly specify a target architecture via -the compiler flags. I.e., please do _not_ specify "-march=native" or -"-mtarget=avx" or similar if you want a portable binary that will run -efficiently on different x86_64 CPUs. +When using a recent gcc (6.0 and newer) or a recent clang (successfully tested +with versions 6 and 7) on an x86_64 platform, the build machinery can compile +the time-critical functions for several different architectures (SSE2, AVX, +AVX2, FMA3, FMA4, AVX512F), and the appropriate implementation will be selected +at runtime. +This is enabled by passing "-DMULTIARCH" as part of the CFLAGS. +If this is enabled, please do _not_ specify "-march=native" or +"-mavx" or similar! If you are compiling libsharp for a particular target CPU only, or if you are using a different compiler, however, "-march-native" should be used. The resulting binary will most likely not run on other computers, though. 
@@ -65,16 +65,16 @@ Example configure invocations ============================= GCC, OpenMP, portable binary: -CFLAGS="-std=c99 -O3 -ffast-math -fopenmp" ./configure +CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math -fopenmp" ./configure GCC, no OpenMP, portable binary: -CFLAGS="-std=c99 -O3 -ffast-math" ./configure +CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math" ./configure -Clang, OpenMP, nonportable binary: -CC=clang CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure +Clang, OpenMP, portable binary: +CC=clang CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math -fopenmp" ./configure Intel C compiler, OpenMP, nonportable binary: -CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure +CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp -D__PURE_INTEL_C99_HEADERS__" ./configure MPI support, nonportable binary: CC=mpicc CFLAGS="-DUSE_MPI -std=c99 -O3 -march=native -ffast-math" ./configure diff --git a/Makefile.am b/Makefile.am index bcf53ff1c0fd1a5ead966eedfe9f3c41a57de45e..b0b09ee3a7b393f3cd653dfdaa43cad392765b0d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -2,7 +2,7 @@ ACLOCAL_AMFLAGS = -I m4 lib_LTLIBRARIES = libsharp.la -src_sharp = \ +libsharp_la_SOURCES = \ c_utils/c_utils.c \ c_utils/c_utils.h \ pocketfft/pocketfft.c \ @@ -10,11 +10,6 @@ src_sharp = \ libsharp/sharp.c \ libsharp/sharp_almhelpers.c \ libsharp/sharp_core.c \ - libsharp/sharp_core_avx.c \ - libsharp/sharp_core_avx2.c \ - libsharp/sharp_core_fma.c \ - libsharp/sharp_core_fma4.c \ - libsharp/sharp_core_avx512f.c \ libsharp/sharp_geomhelpers.c \ libsharp/sharp_legendre_roots.c \ libsharp/sharp_ylmgen_c.c \ @@ -23,6 +18,16 @@ src_sharp = \ libsharp/sharp_vecsupport.h \ libsharp/sharp_ylmgen_c.h +libavx_la_SOURCES = libsharp/sharp_core_inc.c +libavx2_la_SOURCES = libsharp/sharp_core_inc.c +libfma_la_SOURCES = libsharp/sharp_core_inc.c +libfma4_la_SOURCES = libsharp/sharp_core_inc.c +libavx512f_la_SOURCES = libsharp/sharp_core_inc.c + +noinst_LTLIBRARIES = 
libavx.la libavx2.la libfma.la libfma4.la libavx512f.la + +libsharp_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la + include_HEADERS = \ libsharp/sharp.h \ libsharp/sharp_geomhelpers.h \ @@ -30,11 +35,8 @@ include_HEADERS = \ libsharp/sharp_cxx.h EXTRA_DIST = \ - libsharp/sharp_core_inc.c \ runtest.sh -libsharp_la_SOURCES = $(src_sharp) - check_PROGRAMS = sharp_testsuite sharp_testsuite_SOURCES = libsharp/sharp_testsuite.c c_utils/memusage.c c_utils/memusage.h c_utils/walltime_c.c c_utils/walltime_c.h sharp_testsuite_LDADD = libsharp.la @@ -43,6 +45,12 @@ TESTS = runtest.sh AM_CFLAGS = -I$(top_srcdir)/c_utils -I$(top_srcdir)/libsharp @AM_CFLAGS@ +libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx +libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2 +libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma +libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4 +libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f + pkgconfigdir = $(libdir)/pkgconfig nodist_pkgconfig_DATA = @PACKAGE_NAME@.pc diff --git a/README.md b/README.md index c993dd168ff9bdb63d82a6e728ff85863de4c0e5..e135b44e05c9fe71096ed834daab7daf59a39fb9 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ personal communication). These improvements reduce the fraction of CPU time spent on evaluating the recurrences for Y_lm coefficients, which means that computing multiple -simultaneous SHTs no longer have a big performance advantage compared to SHTs +simultaneous SHTs no longer has a big performance advantage compared to SHTs done one after the other. As a consequence, libsharp support for simultaneous SHTs was dropped, making its interface much simpler. 
diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c index 036a8ed674fdd3287242ba07aa6edc382e848562..f54a058b0e72e80a389bc420d3f59f3c4c463c7f 100644 --- a/libsharp/sharp_core.c +++ b/libsharp/sharp_core.c @@ -1,9 +1,7 @@ -#define XCONCATX(a,b) a##_##b -#define XCONCATX2(a,b) XCONCATX(a,b) -#define XARCH(a) XCONCATX2(a,ARCH) - #define ARCH default +#define GENERIC_ARCH #include "sharp_core_inc.c" +#undef GENERIC_ARCH #undef ARCH typedef void (*t_inner_loop) (sharp_job *job, const int *ispair, @@ -18,7 +16,12 @@ static t_veclen veclen_ = NULL; static t_max_nvec max_nvec_ = NULL; static t_architecture architecture_ = NULL; -#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) +#ifdef MULTIARCH /* runtime dispatch; must be built without ISA-specific compiler flags */ + +#if (defined(__AVX512F__) || defined(__FMA4__) || defined(__FMA__) || \ + defined(__AVX2__) || defined(__AVX__)) +#error MULTIARCH specified but platform-specific flags detected +#endif #define DECL(arch) \ static int XCONCATX2(have,arch)(void) \ @@ -39,27 +42,17 @@ int XCONCATX2(sharp_veclen,arch) (void); \ int XCONCATX2(sharp_max_nvec,arch) (int spin); \ const char *XCONCATX2(sharp_architecture,arch) (void); -#if (!defined(__AVX512F__)) DECL(avx512f) -#endif -#if (!defined(__FMA4__)) DECL(fma4) -#endif -#if (!defined(__FMA__)) DECL(fma) -#endif -#if (!defined(__AVX2__)) DECL(avx2) -#endif -#if (!defined(__AVX__)) DECL(avx) -#endif #endif static void assign_funcs(void) { -#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) +#ifdef MULTIARCH #define DECL2(arch) \ if (XCONCATX2(have,arch)()) \ { \ @@ -69,21 +62,11 @@ static void assign_funcs(void) architecture_ = XCONCATX2(sharp_architecture,arch); \ return; \ } -#if (!defined(__AVX512F__)) DECL2(avx512f) -#endif -#if (!defined(__FMA4__)) DECL2(fma4) -#endif -#if (!defined(__FMA__)) DECL2(fma) -#endif -#if (!defined(__AVX2__)) DECL2(avx2) -#endif -#if (!defined(__AVX__)) DECL2(avx) -#endif #endif inner_loop_ = inner_loop_default; veclen_ = sharp_veclen_default; diff --git a/libsharp/sharp_core_avx.c 
b/libsharp/sharp_core_avx.c deleted file mode 100644 index 724e629fb6ca6f7ed3f22def3cf5e2c177544cc3..0000000000000000000000000000000000000000 --- a/libsharp/sharp_core_avx.c +++ /dev/null @@ -1,11 +0,0 @@ -#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) - -#define XCONCATX(a,b) a##_##b -#define XCONCATX2(a,b) XCONCATX(a,b) -#define XARCH(a) XCONCATX2(a,ARCH) - -#define ARCH avx -#pragma GCC target("avx") -#include "sharp_core_inc.c" - -#endif diff --git a/libsharp/sharp_core_avx2.c b/libsharp/sharp_core_avx2.c deleted file mode 100644 index a7ab0a762ee7878777de9f14aeb717207209fbd2..0000000000000000000000000000000000000000 --- a/libsharp/sharp_core_avx2.c +++ /dev/null @@ -1,11 +0,0 @@ -#if (!defined(__AVX2__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) - -#define XCONCATX(a,b) a##_##b -#define XCONCATX2(a,b) XCONCATX(a,b) -#define XARCH(a) XCONCATX2(a,ARCH) - -#define ARCH avx2 -#pragma GCC target("avx2") -#include "sharp_core_inc.c" - -#endif diff --git a/libsharp/sharp_core_avx512f.c b/libsharp/sharp_core_avx512f.c deleted file mode 100644 index 7f1742925a733a311df65ed70bff06f2585ce8f6..0000000000000000000000000000000000000000 --- a/libsharp/sharp_core_avx512f.c +++ /dev/null @@ -1,11 +0,0 @@ -#if (!defined(__AVX512F__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) - -#define XCONCATX(a,b) a##_##b -#define XCONCATX2(a,b) XCONCATX(a,b) -#define XARCH(a) XCONCATX2(a,ARCH) - -#define ARCH avx512f -#pragma GCC target("avx512f") -#include "sharp_core_inc.c" - -#endif diff --git a/libsharp/sharp_core_fma.c b/libsharp/sharp_core_fma.c deleted file mode 100644 index 793151f5b29feed48fa5dce1c400b52e1d74d70e..0000000000000000000000000000000000000000 --- a/libsharp/sharp_core_fma.c +++ /dev/null @@ -1,11 +0,0 @@ -#if (!defined(__FMA__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) - -#define XCONCATX(a,b) a##_##b -#define XCONCATX2(a,b) XCONCATX(a,b) -#define XARCH(a) 
XCONCATX2(a,ARCH) - -#define ARCH fma -#pragma GCC target("fma") -#include "sharp_core_inc.c" - -#endif diff --git a/libsharp/sharp_core_fma4.c b/libsharp/sharp_core_fma4.c deleted file mode 100644 index d71de74d916bcff4306d60b173594779a137fab6..0000000000000000000000000000000000000000 --- a/libsharp/sharp_core_fma4.c +++ /dev/null @@ -1,11 +0,0 @@ -#if (!defined(__FMA4__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6) - -#define XCONCATX(a,b) a##_##b -#define XCONCATX2(a,b) XCONCATX(a,b) -#define XARCH(a) XCONCATX2(a,ARCH) - -#define ARCH fma4 -#pragma GCC target("fma4") -#include "sharp_core_inc.c" - -#endif diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c index d229a49c733813e54f789a78f4c547f57361a1c1..96981f605421e2ed4eb09bb183f58a43d71d0ec2 100644 --- a/libsharp/sharp_core_inc.c +++ b/libsharp/sharp_core_inc.c @@ -29,6 +29,12 @@ * \author Martin Reinecke */ +#if (defined(MULTIARCH) || defined(GENERIC_ARCH)) + +#define XCONCATX(a,b) a##_##b +#define XCONCATX2(a,b) XCONCATX(a,b) +#define XARCH(a) XCONCATX2(a,ARCH) + #include <complex.h> #include <math.h> #include <string.h> @@ -1179,3 +1185,5 @@ const char *XARCH(sharp_architecture)(void) { return xstr(ARCH); } + +#endif