diff --git a/COMPILE b/COMPILE
index 8a5f3cd3c04629b80586a01b4c0fa1af9c823130..5b1c5b2c81c05d8acbfa73b9328637e97b3351d4 100644
--- a/COMPILE
+++ b/COMPILE
@@ -28,14 +28,14 @@ to the C99 standard, you should still be able to compile libsharp with
 Runtime CPU selection with gcc
 ------------------------------
 
-When using a recent gcc (6.0 and newer) on an x86_64 platform, the build
-machinery will compile the time-critical functions for several different
-architectures (SSE2, AVX, AVX2, FMA3, FMA4, AVX512F), and the appropriate
-implementation will be selected at runtime.
-This only happens if you do _not_ explicitly specify a target architecture via
-the compiler flags. I.e., please do _not_ specify "-march=native" or
-"-mtarget=avx" or similar if you want a portable binary that will run
-efficiently on different x86_64 CPUs.
+When using a recent gcc (6.0 and newer) or a recent clang (successfully tested
+with versions 6 and 7) on an x86_64 platform, the build machinery can compile
+the time-critical functions for several different architectures (SSE2, AVX,
+AVX2, FMA3, FMA4, AVX512F), and the appropriate implementation will be selected
+at runtime.
+This is enabled by passing "-DMULTIARCH" as part of the CFLAGS.
+If this is enabled, please do _not_ specify "-march=native" or
+"-mavx" or similar!
 If you are compiling libsharp for a particular target CPU only, or if you are
 using a different compiler, however, "-march-native" should be used. The
 resulting binary will most likely not run on other computers, though.
@@ -65,16 +65,16 @@ Example configure invocations
 =============================
 
 GCC, OpenMP, portable binary:
-CFLAGS="-std=c99 -O3 -ffast-math -fopenmp" ./configure
+CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math -fopenmp" ./configure
 
 GCC, no OpenMP, portable binary:
-CFLAGS="-std=c99 -O3 -ffast-math" ./configure
+CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math" ./configure
 
-Clang, OpenMP, nonportable binary:
-CC=clang CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure
+Clang, OpenMP, portable binary:
+CC=clang CFLAGS="-DMULTIARCH -std=c99 -O3 -ffast-math -fopenmp" ./configure
 
 Intel C compiler, OpenMP, nonportable binary:
-CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp" ./configure
+CC=icc CFLAGS="-std=c99 -O3 -march=native -ffast-math -fopenmp -D__PURE_INTEL_C99_HEADERS__" ./configure
 
 MPI support, nonportable binary:
 CC=mpicc CFLAGS="-DUSE_MPI -std=c99 -O3 -march=native -ffast-math" ./configure
diff --git a/Makefile.am b/Makefile.am
index bcf53ff1c0fd1a5ead966eedfe9f3c41a57de45e..b0b09ee3a7b393f3cd653dfdaa43cad392765b0d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -2,7 +2,7 @@ ACLOCAL_AMFLAGS = -I m4
 
 lib_LTLIBRARIES = libsharp.la
 
-src_sharp = \
+libsharp_la_SOURCES = \
   c_utils/c_utils.c \
   c_utils/c_utils.h \
   pocketfft/pocketfft.c \
@@ -10,11 +10,6 @@ src_sharp = \
   libsharp/sharp.c \
   libsharp/sharp_almhelpers.c \
   libsharp/sharp_core.c \
-  libsharp/sharp_core_avx.c \
-  libsharp/sharp_core_avx2.c \
-  libsharp/sharp_core_fma.c \
-  libsharp/sharp_core_fma4.c \
-  libsharp/sharp_core_avx512f.c \
   libsharp/sharp_geomhelpers.c \
   libsharp/sharp_legendre_roots.c \
   libsharp/sharp_ylmgen_c.c \
@@ -23,6 +18,16 @@ src_sharp = \
   libsharp/sharp_vecsupport.h \
   libsharp/sharp_ylmgen_c.h
 
+libavx_la_SOURCES = libsharp/sharp_core_inc.c
+libavx2_la_SOURCES = libsharp/sharp_core_inc.c
+libfma_la_SOURCES = libsharp/sharp_core_inc.c
+libfma4_la_SOURCES = libsharp/sharp_core_inc.c
+libavx512f_la_SOURCES = libsharp/sharp_core_inc.c
+
+noinst_LTLIBRARIES = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
+
+libsharp_la_LIBADD = libavx.la libavx2.la libfma.la libfma4.la libavx512f.la
+
 include_HEADERS = \
   libsharp/sharp.h \
   libsharp/sharp_geomhelpers.h \
@@ -30,11 +35,8 @@ include_HEADERS = \
   libsharp/sharp_cxx.h
 
 EXTRA_DIST = \
-  libsharp/sharp_core_inc.c \
   runtest.sh
 
-libsharp_la_SOURCES = $(src_sharp)
-
 check_PROGRAMS = sharp_testsuite
 sharp_testsuite_SOURCES = libsharp/sharp_testsuite.c c_utils/memusage.c c_utils/memusage.h c_utils/walltime_c.c c_utils/walltime_c.h
 sharp_testsuite_LDADD = libsharp.la
@@ -43,6 +45,12 @@ TESTS = runtest.sh
 
 AM_CFLAGS = -I$(top_srcdir)/c_utils -I$(top_srcdir)/libsharp @AM_CFLAGS@
 
+libavx_la_CFLAGS = ${AM_CFLAGS} -mavx -DARCH=avx
+libavx2_la_CFLAGS = ${AM_CFLAGS} -mavx2 -DARCH=avx2
+libfma_la_CFLAGS = ${AM_CFLAGS} -mfma -DARCH=fma
+libfma4_la_CFLAGS = ${AM_CFLAGS} -mfma4 -DARCH=fma4
+libavx512f_la_CFLAGS = ${AM_CFLAGS} -mavx512f -DARCH=avx512f
+
 pkgconfigdir = $(libdir)/pkgconfig
 nodist_pkgconfig_DATA = @PACKAGE_NAME@.pc
 
diff --git a/README.md b/README.md
index c993dd168ff9bdb63d82a6e728ff85863de4c0e5..e135b44e05c9fe71096ed834daab7daf59a39fb9 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ personal communication).
 
 These improvements reduce the fraction of CPU time spent on evaluating the
 recurrences for Y_lm coefficients, which means that computing multiple
-simultaneous SHTs no longer have a big performance advantage compared to SHTs
+simultaneous SHTs no longer has a big performance advantage compared to SHTs
 done one after the other.
 As a consequence, libsharp support for simultaneous SHTs was dropped, making
 its interface much simpler.
diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 036a8ed674fdd3287242ba07aa6edc382e848562..f54a058b0e72e80a389bc420d3f59f3c4c463c7f 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -1,9 +1,7 @@
-#define XCONCATX(a,b) a##_##b
-#define XCONCATX2(a,b) XCONCATX(a,b)
-#define XARCH(a) XCONCATX2(a,ARCH)
-
 #define ARCH default
+#define GENERIC_ARCH
 #include "sharp_core_inc.c"
+#undef GENERIC_ARCH
 #undef ARCH
 
 typedef void (*t_inner_loop) (sharp_job *job, const int *ispair,
@@ -18,7 +16,12 @@ static t_veclen veclen_ = NULL;
 static t_max_nvec max_nvec_ = NULL;
 static t_architecture architecture_ = NULL;
 
-#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
+#ifdef MULTIARCH
+
+#if (defined(__AVX512F__) || defined(__FMA4__) || defined(__FMA__) || \
+     defined(__AVX2__) || defined(__AVX__))
+#error MULTIARCH specified but platform-specific flags detected
+#endif /* MULTIARCH must be built without ISA-specific compiler flags */
 
 #define DECL(arch) \
 static int XCONCATX2(have,arch)(void) \
@@ -39,27 +42,17 @@ int XCONCATX2(sharp_veclen,arch) (void); \
 int XCONCATX2(sharp_max_nvec,arch) (int spin); \
 const char *XCONCATX2(sharp_architecture,arch) (void);
 
-#if (!defined(__AVX512F__))
 DECL(avx512f)
-#endif
-#if (!defined(__FMA4__))
 DECL(fma4)
-#endif
-#if (!defined(__FMA__))
 DECL(fma)
-#endif
-#if (!defined(__AVX2__))
 DECL(avx2)
-#endif
-#if (!defined(__AVX__))
 DECL(avx)
-#endif
 
 #endif
 
 static void assign_funcs(void)
   {
-#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
+#ifdef MULTIARCH
 #define DECL2(arch) \
   if (XCONCATX2(have,arch)()) \
     { \
@@ -69,21 +62,11 @@ static void assign_funcs(void)
     architecture_ = XCONCATX2(sharp_architecture,arch); \
     return; \
     }
-#if (!defined(__AVX512F__))
 DECL2(avx512f)
-#endif
-#if (!defined(__FMA4__))
 DECL2(fma4)
-#endif
-#if (!defined(__FMA__))
 DECL2(fma)
-#endif
-#if (!defined(__AVX2__))
 DECL2(avx2)
-#endif
-#if (!defined(__AVX__))
 DECL2(avx)
-#endif
 #endif
   inner_loop_ = inner_loop_default;
   veclen_ = sharp_veclen_default;
diff --git a/libsharp/sharp_core_avx.c b/libsharp/sharp_core_avx.c
deleted file mode 100644
index 724e629fb6ca6f7ed3f22def3cf5e2c177544cc3..0000000000000000000000000000000000000000
--- a/libsharp/sharp_core_avx.c
+++ /dev/null
@@ -1,11 +0,0 @@
-#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
-
-#define XCONCATX(a,b) a##_##b
-#define XCONCATX2(a,b) XCONCATX(a,b)
-#define XARCH(a) XCONCATX2(a,ARCH)
-
-#define ARCH avx
-#pragma GCC target("avx")
-#include "sharp_core_inc.c"
-
-#endif
diff --git a/libsharp/sharp_core_avx2.c b/libsharp/sharp_core_avx2.c
deleted file mode 100644
index a7ab0a762ee7878777de9f14aeb717207209fbd2..0000000000000000000000000000000000000000
--- a/libsharp/sharp_core_avx2.c
+++ /dev/null
@@ -1,11 +0,0 @@
-#if (!defined(__AVX2__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
-
-#define XCONCATX(a,b) a##_##b
-#define XCONCATX2(a,b) XCONCATX(a,b)
-#define XARCH(a) XCONCATX2(a,ARCH)
-
-#define ARCH avx2
-#pragma GCC target("avx2")
-#include "sharp_core_inc.c"
-
-#endif
diff --git a/libsharp/sharp_core_avx512f.c b/libsharp/sharp_core_avx512f.c
deleted file mode 100644
index 7f1742925a733a311df65ed70bff06f2585ce8f6..0000000000000000000000000000000000000000
--- a/libsharp/sharp_core_avx512f.c
+++ /dev/null
@@ -1,11 +0,0 @@
-#if (!defined(__AVX512F__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
-
-#define XCONCATX(a,b) a##_##b
-#define XCONCATX2(a,b) XCONCATX(a,b)
-#define XARCH(a) XCONCATX2(a,ARCH)
-
-#define ARCH avx512f
-#pragma GCC target("avx512f")
-#include "sharp_core_inc.c"
-
-#endif
diff --git a/libsharp/sharp_core_fma.c b/libsharp/sharp_core_fma.c
deleted file mode 100644
index 793151f5b29feed48fa5dce1c400b52e1d74d70e..0000000000000000000000000000000000000000
--- a/libsharp/sharp_core_fma.c
+++ /dev/null
@@ -1,11 +0,0 @@
-#if (!defined(__FMA__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
-
-#define XCONCATX(a,b) a##_##b
-#define XCONCATX2(a,b) XCONCATX(a,b)
-#define XARCH(a) XCONCATX2(a,ARCH)
-
-#define ARCH fma
-#pragma GCC target("fma")
-#include "sharp_core_inc.c"
-
-#endif
diff --git a/libsharp/sharp_core_fma4.c b/libsharp/sharp_core_fma4.c
deleted file mode 100644
index d71de74d916bcff4306d60b173594779a137fab6..0000000000000000000000000000000000000000
--- a/libsharp/sharp_core_fma4.c
+++ /dev/null
@@ -1,11 +0,0 @@
-#if (!defined(__FMA4__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
-
-#define XCONCATX(a,b) a##_##b
-#define XCONCATX2(a,b) XCONCATX(a,b)
-#define XARCH(a) XCONCATX2(a,ARCH)
-
-#define ARCH fma4
-#pragma GCC target("fma4")
-#include "sharp_core_inc.c"
-
-#endif
diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c
index d229a49c733813e54f789a78f4c547f57361a1c1..96981f605421e2ed4eb09bb183f58a43d71d0ec2 100644
--- a/libsharp/sharp_core_inc.c
+++ b/libsharp/sharp_core_inc.c
@@ -29,6 +29,12 @@
  *  \author Martin Reinecke
  */
 
+#if (defined(MULTIARCH) || defined(GENERIC_ARCH))
+
+#define XCONCATX(a,b) a##_##b
+#define XCONCATX2(a,b) XCONCATX(a,b)
+#define XARCH(a) XCONCATX2(a,ARCH)
+
 #include <complex.h>
 #include <math.h>
 #include <string.h>
@@ -1179,3 +1185,5 @@ const char *XARCH(sharp_architecture)(void)
   {
   return xstr(ARCH);
   }
+
+#endif