From 4d0b0ab10aae8a8284019e2fe8ff011671c7f88e Mon Sep 17 00:00:00 2001 From: Andreas Marek Date: Wed, 29 May 2019 11:26:33 +0200 Subject: [PATCH] Start to implement real NEON ARCH64 kernels --- Doxyfile.in | 12 + Makefile.am | 21 + configure.ac | 37 +- elpa/elpa_constants.h.in | 11 +- src/elpa2/compute_hh_trafo.F90 | 197 +++ ...real_128bit_256bit_512bit_BLOCK_template.c | 1137 ++++++++++------- .../real_neon_arch64_2hv_double_precision.c | 59 + .../real_neon_arch64_2hv_single_precision.c | 59 + .../real_neon_arch64_4hv_double_precision.c | 59 + .../real_neon_arch64_4hv_single_precision.c | 59 + .../real_neon_arch64_6hv_double_precision.c | 59 + .../real_neon_arch64_6hv_single_precision.c | 59 + 12 files changed, 1265 insertions(+), 504 deletions(-) create mode 100644 src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c create mode 100644 src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c create mode 100644 src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c create mode 100644 src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c create mode 100644 src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c create mode 100644 src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c diff --git a/Doxyfile.in b/Doxyfile.in index 2771d035..ecd342e9 100644 --- a/Doxyfile.in +++ b/Doxyfile.in @@ -908,10 +908,14 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \ @top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_sse_6hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_sparc64_6hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/real_avx512_4hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/asm_x86_64_double_precision.s \ @top_srcdir@/src/elpa2/kernels/real_avx512_4hv_double_precision.c \ 
@top_srcdir@/src/elpa2/kernels/real_sse_6hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_sparc64_6hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/asm_x86_64_single_precision.s \ @top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c \ @@ -929,19 +933,27 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \ @top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/real_bgq.f90 \ @top_srcdir@/src/elpa2/kernels/real_sse_2hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_template.c \ @top_srcdir@/src/elpa2/kernels/real_simple.F90 \ @top_srcdir@/src/elpa2/kernels/complex.F90 \ @top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_template.c \ @top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_template.c \ @top_srcdir@/src/elpa2/kernels/real_sse_2hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_sse_2hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_bgp.f90 \ @top_srcdir@/src/elpa2/kernels/real_sse_4hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_sparc64_4hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_avx512_6hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_sse_2hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/real_avx512_6hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/real_sse_4hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_sparc64_4hv_single_precision.c \ + 
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c \ @top_srcdir@/src/elpa2/elpa2_compute_complex_template.F90 \ @top_srcdir@/src/elpa2/elpa2_bandred_template.F90 \ @top_srcdir@/src/elpa2/pack_unpack_gpu.F90 \ diff --git a/Makefile.am b/Makefile.am index 5c4cbbe1..1e9393cd 100644 --- a/Makefile.am +++ b/Makefile.am @@ -227,6 +227,13 @@ if WITH_REAL_SPARC64_BLOCK2_KERNEL #endif endif +if WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c +endif +endif + if WITH_REAL_VSX_BLOCK2_KERNEL libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_2hv_double_precision.c if WANT_SINGLE_PRECISION_REAL @@ -269,6 +276,13 @@ if WITH_REAL_SPARC64_BLOCK4_KERNEL #endif endif +if WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c +endif +endif + if WITH_REAL_VSX_BLOCK4_KERNEL libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c if WANT_SINGLE_PRECISION_REAL @@ -311,6 +325,13 @@ if WITH_REAL_SPARC64_BLOCK6_KERNEL #endif endif +if WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c +endif +endif + if WITH_REAL_VSX_BLOCK6_KERNEL libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_double_precision.c if WANT_SINGLE_PRECISION_REAL diff --git a/configure.ac b/configure.ac index 6e87de1b..93344392 100644 --- a/configure.ac +++ b/configure.ac @@ -636,6 +636,12 @@ 
m4_define(elpa_m4_sparc64_kernels, [ real_sparc64_block6 ]) +m4_define(elpa_m4_neon_arch64_kernels, [ + real_neon_arch64_block2 + real_neon_arch64_block4 + real_neon_arch64_block6 +]) + m4_define(elpa_m4_vsx_kernels, [ real_vsx_block2 real_vsx_block4 @@ -681,7 +687,7 @@ m4_define(elpa_m4_gpu_kernels, [ complex_gpu ]) -m4_define(elpa_m4_kernel_types, [generic sparc64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu]) +m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu]) m4_define(elpa_m4_all_kernels, m4_foreach_w([elpa_m4_type], @@ -715,6 +721,7 @@ AC_DEFUN([ELPA_SELECT_KERNELS], [ dnl Modify list of kernels with configure arguments ELPA_SELECT_KERNELS([generic],[enable]) ELPA_SELECT_KERNELS([sparc64],[disable]) +ELPA_SELECT_KERNELS([neon_arch64],[disable]) ELPA_SELECT_KERNELS([vsx],[disable]) ELPA_SELECT_KERNELS([sse],[enable]) ELPA_SELECT_KERNELS([sse_assembly],[enable]) @@ -730,7 +737,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[ ]) if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then - m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [ + m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [ if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then echo "Disabling elpa_m4_kernel due to BGP/BGQ option" fi @@ -790,7 +797,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[ ]) fi ]) -m4_foreach_w([elpa_m4_arch],[sparc64 vsx sse avx avx2 avx512],[ +m4_foreach_w([elpa_m4_arch],[sparc64 neon_arch64 vsx sse avx avx2 avx512],[ ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2]) ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2]) ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], 
[complex_]elpa_m4_arch[_block1]) @@ -848,7 +855,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[ m4_foreach_w([elpa_m4_kind],[real complex],[ m4_foreach_w([elpa_m4_kernel], m4_foreach_w([elpa_m4_cand_kernel], - elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels, + elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels, [m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ), [ if test -z "$default_[]elpa_m4_kind[]_kernel"; then @@ -895,7 +902,6 @@ int main(int argc, char **argv) { AC_DEFINE([HAVE_VSX_SSE],[1],[Altivec VSX intrinsics are supported on this CPU]) fi - if test x"${need_sparc64}" = x"yes"; then AC_MSG_CHECKING(whether we can compile SPARC64 with intrinsics in C) AC_COMPILE_IFELSE([AC_LANG_SOURCE([ @@ -917,6 +923,27 @@ int main(int argc, char **argv) { AC_DEFINE([HAVE_SPARC64_SSE],[1],[SPARC64 intrinsics are supported on this CPU]) fi +if test x"${need_neon_arch64}" = x"yes"; then + AC_MSG_CHECKING(whether we can compile NEON ARCH64 with intrinsics in C) + AC_COMPILE_IFELSE([AC_LANG_SOURCE([ +#include <arm_neon.h> +int main(int argc, char **argv) { + __Float64x2_t x1, x2, x3, x4; + x4 = vfmaq_f64(x1, x2, x3); + return 0; +} + ])], + [can_compile_neon_arch64=yes], + [can_compile_neon_arch64=no] + ) + AC_MSG_RESULT([${can_compile_neon_arch64}]) + if test x"$can_compile_neon_arch64" != x"yes"; then + AC_MSG_ERROR([Could not compile test program, try with --disable-neon_arch64, or adjust the C compiler or CFLAGS]) + fi + AC_DEFINE([HAVE_NEON_ARCH64_SSE],[1],[NEON_ARCH64 intrinsics are supported on this CPU]) +fi + + if test x"${need_sse}" = x"yes"; then AC_MSG_CHECKING(whether we can compile SSE3 with gcc intrinsics in C) AC_COMPILE_IFELSE([AC_LANG_SOURCE([ diff --git
a/elpa/elpa_constants.h.in b/elpa/elpa_constants.h.in index 3da6dd21..0af4f210 100644 --- a/elpa/elpa_constants.h.in +++ b/elpa/elpa_constants.h.in @@ -44,10 +44,13 @@ enum ELPA_SOLVERS { X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \ - X(ELPA_2STAGE_REAL_VSX_BLOCK2, 22, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \ - X(ELPA_2STAGE_REAL_VSX_BLOCK4, 23, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \ - X(ELPA_2STAGE_REAL_VSX_BLOCK6, 24, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \ - X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 25, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) + X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 22, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 23, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_VSX_BLOCK2, 25, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_VSX_BLOCK4, 26, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_VSX_BLOCK6, 27, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 28, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) #define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \ ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \ diff --git a/src/elpa2/compute_hh_trafo.F90 b/src/elpa2/compute_hh_trafo.F90 index 82948482..a8f1f8f7 100644 --- a/src/elpa2/compute_hh_trafo.F90 +++ b/src/elpa2/compute_hh_trafo.F90 @@ -339,6 +339,7 @@ kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. & kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. & kernel .eq. 
ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. & + kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2 .or. & kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. & kernel .eq. ELPA_2STAGE_REAL_GENERIC .or. & kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. & @@ -850,6 +851,43 @@ #endif /* REALCASE == 1 */ +#if REALCASE == 1 +! implementation of neon_arch64 block 2 real case +#if defined(WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL) + +#ifndef WITH_FIXED_REAL_KERNEL + if (kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2) then + +#endif /* not WITH_FIXED_REAL_KERNEL */ + +#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL)) + do j = ncols, 2, -2 + w(:,1) = bcast_buffer(1:nbw,j+off) + w(:,2) = bcast_buffer(1:nbw,j+off-1) +#ifdef WITH_OPENMP + call double_hh_trafo_& + &MATH_DATATYPE& + &_neon_arch64_2hv_& + &PRECISION & + & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw) +#else + call double_hh_trafo_& + &MATH_DATATYPE& + &_neon_arch64_2hv_& + &PRECISION & + & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw) +#endif + enddo +#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL)) */ + +#ifndef WITH_FIXED_REAL_KERNEL + endif +#endif /* not WITH_FIXED_REAL_KERNEL */ +#endif /* WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL */ + +#endif /* REALCASE == 1 */ + + #if REALCASE == 1 ! implementation of vsx block 2 real case @@ -1656,6 +1694,77 @@ #endif /* REALCASE */ +#if REALCASE == 1 +! neon_arch64 block 4 real kernel + +#if defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL) +#ifndef WITH_FIXED_REAL_KERNEL + if (kernel .eq. 
ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4) then + +#endif /* not WITH_FIXED_REAL_KERNEL */ + +#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL)) + do j = ncols, 4, -4 + w(:,1) = bcast_buffer(1:nbw,j+off) + w(:,2) = bcast_buffer(1:nbw,j+off-1) + w(:,3) = bcast_buffer(1:nbw,j+off-2) + w(:,4) = bcast_buffer(1:nbw,j+off-3) +#ifdef WITH_OPENMP + call quad_hh_trafo_& + &MATH_DATATYPE& + &_neon_arch64_4hv_& + &PRECISION& + & (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw) +#else + call quad_hh_trafo_& + &MATH_DATATYPE& + &_neon_arch64_4hv_& + &PRECISION& + & (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw) +#endif + enddo + do jj = j, 2, -2 + w(:,1) = bcast_buffer(1:nbw,jj+off) + w(:,2) = bcast_buffer(1:nbw,jj+off-1) +#ifdef WITH_OPENMP + call double_hh_trafo_& + &MATH_DATATYPE& + &_neon_arch64_2hv_& + &PRECISION& + & (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw) +#else + call double_hh_trafo_& + &MATH_DATATYPE& + &_neon_arch64_2hv_& + &PRECISION& + & (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw) +#endif + enddo +#ifdef WITH_OPENMP + if (jj==1) call single_hh_trafo_& + &MATH_DATATYPE& + &_cpu_openmp_& + &PRECISION& + & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), & + bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) +#else + if (jj==1) call single_hh_trafo_& + &MATH_DATATYPE& + &_cpu_& + &PRECISION& + & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) +#endif + +#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL)) */ + +#ifndef WITH_FIXED_REAL_KERNEL + endif +#endif /* not WITH_FIXED_REAL_KERNEL */ +#endif /* WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL */ + +#endif /* REALCASE */ + + #if REALCASE == 1 ! 
vsx block4 real kernel @@ -2040,6 +2149,94 @@ #endif /* REALCASE */ +#if REALCASE == 1 +!neon_arch64 block6 real kernel +#if defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) +#ifndef WITH_FIXED_REAL_KERNEL + if (kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6) then + +#endif /* not WITH_FIXED_REAL_KERNEL */ + ! NEON ARCH64 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS + do j = ncols, 6, -6 + w(:,1) = bcast_buffer(1:nbw,j+off) + w(:,2) = bcast_buffer(1:nbw,j+off-1) + w(:,3) = bcast_buffer(1:nbw,j+off-2) + w(:,4) = bcast_buffer(1:nbw,j+off-3) + w(:,5) = bcast_buffer(1:nbw,j+off-4) + w(:,6) = bcast_buffer(1:nbw,j+off-5) +#ifdef WITH_OPENMP + call hexa_hh_trafo_& + &MATH_DATATYPE& + &_neon_arch64_6hv_& + &PRECISION& + & (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw) +#else + call hexa_hh_trafo_& + &MATH_DATATYPE& + &_neon_arch64_6hv_& + &PRECISION& + & (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw) +#endif + enddo + do jj = j, 4, -4 + w(:,1) = bcast_buffer(1:nbw,jj+off) + w(:,2) = bcast_buffer(1:nbw,jj+off-1) + w(:,3) = bcast_buffer(1:nbw,jj+off-2) + w(:,4) = bcast_buffer(1:nbw,jj+off-3) +#ifdef WITH_OPENMP + call quad_hh_trafo_& + &MATH_DATATYPE& + &_neon_arch64_4hv_& + &PRECISION& + & (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw) +#else + call quad_hh_trafo_& + &MATH_DATATYPE& + &_neon_arch64_4hv_& + &PRECISION& + & (c_loc(a(1,jj+off+a_off-3,istripe)), w, & + nbw, nl, stripe_width, nbw) +#endif + enddo + do jjj = jj, 2, -2 + w(:,1) = bcast_buffer(1:nbw,jjj+off) + w(:,2) = bcast_buffer(1:nbw,jjj+off-1) +#ifdef WITH_OPENMP + call double_hh_trafo_& + &MATH_DATATYPE& + &_neon_arch64_2hv_& + &PRECISION& + & (c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw) +#else + call double_hh_trafo_& + &MATH_DATATYPE& + &_neon_arch64_2hv_& + &PRECISION& + & (c_loc(a(1,jjj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw) +#endif + enddo +#ifdef WITH_OPENMP + if (jjj==1) call
single_hh_trafo_& + &MATH_DATATYPE& + &_cpu_openmp_& + &PRECISION& + & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), & + bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) +#else + if (jjj==1) call single_hh_trafo_& + &MATH_DATATYPE& + &_cpu_& + &PRECISION& + & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) +#endif +#ifndef WITH_FIXED_REAL_KERNEL + endif +#endif /* not WITH_FIXED_REAL_KERNEL */ +#endif /* WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL */ + +#endif /* REALCASE */ + + #if REALCASE == 1 !vsx block6 real kernel #if defined(WITH_REAL_VSX_BLOCK6_KERNEL) diff --git a/src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c b/src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c index a619d3dc..09052320 100644 --- a/src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c +++ b/src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c @@ -65,17 +65,28 @@ #define CONCAT_3ARGS(a, b, c) CONCAT2_3ARGS(a, b, c) #define CONCAT2_3ARGS(a, b, c) a ## b ## c +//define instruction set numbers +#define NEON_ARCH64_128 1285 + + + #if VEC_SET == 128 || VEC_SET == 256 || VEC_SET == 512 #include <x86intrin.h> #endif + #if VEC_SET == 1281 #include <fjmfunc.h> #include <emmintrin.h> #endif + #if VEC_SET == 1282 #include <altivec.h> #endif +#if VEC_SET == NEON_ARCH64_128 +#include <arm_neon.h> +#endif + #include <stdio.h> #include <stdlib.h> @@ -106,6 +117,10 @@ #define SIMD_SET VSX #endif +#if VEC_SET == NEON_ARCH64_128 +#define SIMD_SET NEON_ARCH64 +#endif + #if VEC_SET == 256 #define SIMD_SET AVX_AVX2 #endif @@ -129,6 +144,10 @@ #if VEC_SET == 128 #define _SIMD_SET _mm_set_pd #define _SIMD_SET1 _mm_set1_pd +#define _SIMD_NEG 1 +#endif +#if VEC_SET == 1281 +#define _SIMD_NEG _fjsp_neg_v2r8 #endif #endif /* DOUBLE_PRECISION_REAL */ #ifdef SINGLE_PRECISION_REAL @@ -143,7 +162,11 @@ #if VEC_SET == 128 #define _SIMD_SET _mm_set_ps #define _SIMD_SET1 _mm_set1_ps +#define _SIMD_NEG 1 #endif +#if VEC_SET == 1281 +#define _SIMD_NEG 1 +#endif #endif /* SINGLE_PRECISION_REAL */
#endif /* VEC_SET == 128 || VEC_SET == 1281 */ @@ -161,6 +184,7 @@ #define _SIMD_LOAD (__vector float) vec_ld #endif +#define _SIMD_NEG 1 #define _SIMD_STORE vec_st #define _SIMD_ADD vec_add #define _SIMD_MUL vec_mul @@ -168,6 +192,35 @@ #endif /* VEC_SET == 1281 */ +#if VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#define offset 2 +#define __SIMD_DATATYPE __Float64x2_t +#define _SIMD_LOAD vld1q_f64 +#define _SIMD_STORE vst1q_f64 +#define _SIMD_ADD vaddq_f64 +#define _SIMD_MUL vmulq_f64 +#define _SIMD_SUB vsubq_f64 +#define _SIMD_NEG vnegq_f64 +// FMA +//#define _SIMD_XOR _mm_xor_pd +#define _SIMD_SET1 vdupq_n_f64 +#endif /* DOUBLE_PRECISION_REAL */ +#ifdef SINGLE_PRECISION_REAL +#define offset 4 +#define __SIMD_DATATYPE __Float32x4_t +#define _SIMD_LOAD vld1q_f32 +#define _SIMD_STORE vst1q_f32 +#define _SIMD_ADD vaddq_f32 +#define _SIMD_MUL vmulq_f32 +#define _SIMD_SUB vsubq_f32 +#define _SIMD_NEG vnegq_f32 +//FMA +//#define _SIMD_XOR _mm_xor_ps +#define _SIMD_SET1 vdupq_n_f32 +#endif /* SINGLE_PRECISION_REAL */ +#endif /* VEC_SET == NEON_ARCH64_128 */ + #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL #define offset 4 @@ -180,6 +233,7 @@ #define _SIMD_SET1 _mm256_set1_pd #define _SIMD_XOR _mm256_xor_pd #define _SIMD_BROADCAST _mm256_broadcast_sd +#define _SIMD_NEG 1 #ifdef HAVE_AVX2 #ifdef __FMA4__ #define __ELPA_USE_FMA__ @@ -213,6 +267,7 @@ #define _SIMD_SET1 _mm256_set1_ps #define _SIMD_XOR _mm256_xor_ps #define _SIMD_BROADCAST _mm256_broadcast_ss +#define _SIMD_NEG 1 #ifdef HAVE_AVX2 #ifdef __FMA4__ #define __ELPA_USE_FMA__ @@ -247,6 +302,7 @@ #define _SIMD_MUL _mm512_mul_pd #define _SIMD_SUB _mm512_sub_pd #define _SIMD_SET1 _mm512_set1_pd +#define _SIMD_NEG 1 #ifdef HAVE_AVX512_XEON #define _SIMD_XOR _mm512_xor_pd #endif @@ -273,6 +329,7 @@ #define _SIMD_MUL _mm512_mul_ps #define _SIMD_SUB _mm512_sub_ps #define _SIMD_SET1 _mm512_set1_ps +#define _SIMD_NEG 1 #ifdef HAVE_AVX512_XEON #define _SIMD_XOR _mm512_xor_ps #endif @@ -305,7 +362,7 @@
#undef __AVX__ #endif -#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 256 || VEC_SET == 512 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 || VEC_SET == 256 || VEC_SET == 512 #undef _LOAD #undef _STORE #undef _XOR @@ -324,7 +381,7 @@ #endif -#if VEC_SET == 128 || VEC_SET == 1281 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 //Forward declaration #ifdef DOUBLE_PRECISION_REAL #undef ROW_LENGTH @@ -334,7 +391,7 @@ #undef ROW_LENGTH #define ROW_LENGTH 4 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -368,7 +425,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h DATA_TYPE_PTR scalarprods); #endif -#if VEC_SET == 128 || VEC_SET == 1281 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #undef ROW_LENGTH #define ROW_LENGTH 4 @@ -377,7 +434,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #undef ROW_LENGTH #define ROW_LENGTH 8 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -411,7 +468,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h DATA_TYPE_PTR scalarprods); #endif -#if VEC_SET == 128 || VEC_SET == 1281 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #undef ROW_LENGTH #define ROW_LENGTH 6 @@ -420,7 +477,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #undef ROW_LENGTH #define ROW_LENGTH 12 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -455,7 +512,7 @@ __forceinline void 
CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h DATA_TYPE_PTR scalarprods); #endif -#if VEC_SET == 128 || VEC_SET == 1281 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #undef ROW_LENGTH #define ROW_LENGTH 8 @@ -464,7 +521,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #undef ROW_LENGTH #define ROW_LENGTH 16 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -499,7 +556,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h DATA_TYPE_PTR scalarprods); #endif -#if VEC_SET == 128 || VEC_SET == 1281 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #undef ROW_LENGTH #define ROW_LENGTH 10 @@ -508,7 +565,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #undef ROW_LENGTH #define ROW_LENGTH 20 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -543,7 +600,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h DATA_TYPE_PTR scalarprods); #endif -#if VEC_SET == 128 || VEC_SET == 1281 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #undef ROW_LENGTH #define ROW_LENGTH 12 @@ -552,7 +609,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #undef ROW_LENGTH #define ROW_LENGTH 24 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -644,6 +701,35 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA !f>#endif */ +/* +!f>#ifdef 
HAVE_NEON_ARCH64_SSE +!f> interface +!f> subroutine double_hh_trafo_real_NEON_ARCH64_2hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_real_NEON_ARCH64_2hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_NEON_ARCH64_SSE +!f> interface +!f> subroutine double_hh_trafo_real_NEON_ARCH64_2hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_real_NEON_ARCH64_2hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + + /* !f>#ifdef HAVE_VSX_SSE !f> interface @@ -783,6 +869,35 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA !f>#endif */ +/* +!f>#ifdef HAVE_NEON_ARCH64_SSE +!f> interface +!f> subroutine quad_hh_trafo_real_NEON_ARCH64_4hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="quad_hh_trafo_real_NEON_ARCH64_4hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_NEON_ARCH64_SSE +!f> interface +!f> subroutine quad_hh_trafo_real_NEON_ARCH64_4hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="quad_hh_trafo_real_NEON_ARCH64_4hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + + /* !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2) !f> interface @@ -863,6 +978,22 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA !f> end interface !f>#endif */ + 
+/* +!f>#ifdef HAVE_NEON_ARCH64_SSE +!f> interface +!f> subroutine hexa_hh_trafo_real_NEON_ARCH64_6hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="hexa_hh_trafo_real_NEON_ARCH64_6hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + + /* !f>#ifdef HAVE_SSE_INTRINSICS !f> interface @@ -890,6 +1021,21 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA !f>#endif */ +/* +!f>#ifdef HAVE_NEON_ARCH64_SSE +!f> interface +!f> subroutine hexa_hh_trafo_real_NEON_ARCH64_6hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="hexa_hh_trafo_real_NEON_ARCH64_6hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + + /* !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2) !f> interface @@ -1111,7 +1257,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA // Production level kernel calls with padding #ifdef BLOCK2 -#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #define STEP_SIZE 12 #define ROW_LENGTH 12 @@ -1122,7 +1268,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA #define ROW_LENGTH 24 #define UPPER_BOUND 20 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -1163,14 +1309,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA } #undef ROW_LENGTH -#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 +#if VEC_SET == 128
|| VEC_SET == 1281 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #define ROW_LENGTH 10 #endif #ifdef SINGLE_PRECISION_REAL #define ROW_LENGTH 20 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -1197,14 +1343,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA } #undef ROW_LENGTH -#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #define ROW_LENGTH 8 #endif #ifdef SINGLE_PRECISION_REAL #define ROW_LENGTH 16 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -1232,14 +1378,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA } #undef ROW_LENGTH -#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #define ROW_LENGTH 6 #endif #ifdef SINGLE_PRECISION_REAL #define ROW_LENGTH 12 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -1264,17 +1410,17 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA worked_on += ROW_LENGTH; } -#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 256 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 || VEC_SET == 256 #undef ROW_LENGTH -#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 
1282 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #define ROW_LENGTH 4 #endif #ifdef SINGLE_PRECISION_REAL #define ROW_LENGTH 8 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -1293,14 +1439,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA } #undef ROW_LENGTH -#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #define ROW_LENGTH 2 #endif #ifdef SINGLE_PRECISION_REAL #define ROW_LENGTH 4 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -1317,7 +1463,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA worked_on += ROW_LENGTH; } -#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 || VEC_SET == 256 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 || VEC_SET == 256 */ #endif /* BLOCK2 */ @@ -1325,7 +1471,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA #undef ROW_LENGTH -#if VEC_SET == 128 || VEC_SET == 1281 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #define ROW_LENGTH 6 #define STEP_SIZE 6 @@ -1336,7 +1482,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA #define STEP_SIZE 12 #define UPPER_BOUND 8 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -1376,14 +1522,14 @@ void 
CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA #undef ROW_LENGTH -#if VEC_SET == 128 || VEC_SET == 1281 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #define ROW_LENGTH 4 #endif #ifdef SINGLE_PRECISION_REAL #define ROW_LENGTH 8 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -1411,14 +1557,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA } #undef ROW_LENGTH -#if VEC_SET == 128 || VEC_SET == 1281 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #define ROW_LENGTH 2 #endif #ifdef SINGLE_PRECISION_REAL #define ROW_LENGTH 4 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -1468,7 +1614,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA #ifdef BLOCK6 #undef ROW_LENGTH -#if VEC_SET == 128 || VEC_SET == 1281 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #define ROW_LENGTH 4 #define STEP_SIZE 4 @@ -1479,7 +1625,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA #define STEP_SIZE 8 #define UPPER_BOUND 4 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -1518,14 +1664,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA } #undef ROW_LENGTH -#if VEC_SET == 128 || VEC_SET == 1281 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #define ROW_LENGTH 2 #endif #ifdef SINGLE_PRECISION_REAL #define ROW_LENGTH 4 #endif 
-#endif /* VEC_SET == 128 || VEC_SET == 1281 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -1598,14 +1744,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA } #undef ROW_LENGTH -#if VEC_SET == 128 || VEC_SET == 1281 +#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 #ifdef DOUBLE_PRECISION_REAL #define ROW_LENGTH 12 #endif #ifdef SINGLE_PRECISION_REAL #define ROW_LENGTH 24 #endif -#endif /* VEC_SET == 128 || VEC_SET == 1281 */ +#endif /* VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 */ #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL @@ -1681,6 +1827,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE sign = vec_spalts(-1.0); #endif + #if VEC_SET == 256 #ifdef DOUBLE_PRECISION_REAL __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set1_epi64x(0x8000000000000000); @@ -1706,7 +1853,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE x5 = _LOAD(&q[ldq+4*offset]); __SIMD_DATATYPE x6 = _LOAD(&q[ldq+5*offset]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h1 = _SIMD_SET1(hh[ldh+1]); #endif #if VEC_SET == 1281 @@ -1752,7 +1899,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq]); __SIMD_DATATYPE a4_1 = _LOAD(&q[0]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); @@ -1934,7 +2081,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE a5_1 = 
_LOAD(&q[ldq]); __SIMD_DATATYPE a6_1 = _LOAD(&q[0]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_6_5 = _SIMD_SET1(hh[(ldh*5)+1]); __SIMD_DATATYPE h_6_4 = _SIMD_SET1(hh[(ldh*5)+2]); __SIMD_DATATYPE h_6_3 = _SIMD_SET1(hh[(ldh*5)+3]); @@ -1972,7 +2119,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h t1 = _SIMD_ADD(t1, _SIMD_MUL(a1_1, h_6_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_5_4 = _SIMD_SET1(hh[(ldh*4)+1]); __SIMD_DATATYPE h_5_3 = _SIMD_SET1(hh[(ldh*4)+2]); __SIMD_DATATYPE h_5_2 = _SIMD_SET1(hh[(ldh*4)+3]); @@ -2005,7 +2152,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v1 = _SIMD_ADD(v1, _SIMD_MUL(a1_1, h_5_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); @@ -2033,7 +2180,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); @@ -2298,7 +2445,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h for(i = BLOCK; i < nb; i++) { -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || 
VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); #endif @@ -2347,7 +2494,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* __ELPA_USE_FMA__ */ #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); #endif @@ -2375,7 +2522,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z6 = _SIMD_ADD(z6, _SIMD_MUL(q6,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); #endif @@ -2407,7 +2554,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); #endif #if VEC_SET == 1281 @@ -2433,7 +2580,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v6 = _SIMD_ADD(v6, _SIMD_MUL(q6,h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+i]); #endif @@ -2464,7 +2611,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK6 */ } -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); #endif #if VEC_SET == 1281 @@ -2498,7 +2645,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #if defined(BLOCK4) || 
defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); #endif @@ -2526,7 +2673,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); #endif @@ -2555,7 +2702,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif #ifdef BLOCK4 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-2]); #endif @@ -2590,7 +2737,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[(ldh*1)+nb-1]); #endif @@ -2618,7 +2765,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-1]); #endif @@ -2658,7 +2805,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); #endif @@ -2686,7 +2833,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w6 = 
_SIMD_ADD(w6, _SIMD_MUL(q6,h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); #endif @@ -2714,7 +2861,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v6 = _SIMD_ADD(v6, _SIMD_MUL(q6,h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-4]); #endif #if VEC_SET == 1281 @@ -2747,7 +2894,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-3]); #endif #if VEC_SET == 1281 @@ -2773,7 +2920,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); #endif #if VEC_SET == 1281 @@ -2799,7 +2946,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z6 = _SIMD_ADD(z6, _SIMD_MUL(q6,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]); #endif #if VEC_SET == 1281 @@ -2825,7 +2972,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w6 = _SIMD_ADD(w6, _SIMD_MUL(q6,h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || 
VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-3]); #endif #if VEC_SET == 1281 @@ -2858,7 +3005,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-2]); #endif @@ -2886,7 +3033,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); #endif @@ -2914,7 +3061,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z6 = _SIMD_ADD(z6, _SIMD_MUL(q6,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-2]); #endif @@ -2948,7 +3095,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-1]); #endif @@ -2976,7 +3123,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-1]); #endif @@ -3029,7 +3176,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h 
///////////////////////////////////////////////////// #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE tau1 = _SIMD_SET1(hh[0]); __SIMD_DATATYPE tau2 = _SIMD_SET1(hh[ldh]); @@ -3164,8 +3311,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h h1 = _XOR(tau1, sign); #endif -#if VEC_SET == 1281 - h1 = _fjsp_neg_v2r8(tau1); +#if VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau1); #endif #if VEC_SET == 512 @@ -3200,8 +3347,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h h1 = _XOR(tau2, sign); #endif -#if VEC_SET == 1281 - h1 = _fjsp_neg_v2r8(tau2); +#if VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau2); #endif #if VEC_SET == 512 @@ -3423,7 +3570,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[5*offset],q6); #ifdef BLOCK2 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+1]); #endif #if VEC_SET == 1281 @@ -3463,7 +3610,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK2 */ #ifdef BLOCK4 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+1]); #endif @@ -3505,7 +3652,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[ldq+4*offset],q5); _STORE(&q[ldq+5*offset],q6); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+2]); #endif @@ -3545,7 +3692,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, 
h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+1]); #endif @@ -3580,7 +3727,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq*2)+4*offset],q5); _STORE(&q[(ldq*2)+5*offset],q6); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+3]); #endif @@ -3622,7 +3769,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+1]); #endif @@ -3650,7 +3797,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(y6, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+2]); #endif @@ -3689,7 +3836,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+1]); #endif #if VEC_SET == 1281 @@ -3735,7 +3882,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq+4*offset)],q5); _STORE(&q[(ldq+5*offset)],q6); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+1]); #endif #if VEC_SET == 1281 @@ -3774,7 +3921,7 @@ __forceinline void 
CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+2]); #endif #if VEC_SET == 1281 @@ -3807,7 +3954,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq*2)+4*offset],q5); _STORE(&q[(ldq*2)+5*offset],q6); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+1]); #endif @@ -3848,7 +3995,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+2]); #endif #if VEC_SET == 1281 @@ -3874,7 +4021,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+3]); #endif #if VEC_SET == 1281 @@ -3907,7 +4054,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq*3)+4*offset],q5); _STORE(&q[(ldq*3)+5*offset],q6); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+1]); #endif #if VEC_SET == 1281 @@ -3945,7 +4092,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 
|| VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+2]); #endif #if VEC_SET == 1281 @@ -3971,7 +4118,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+3]); #endif #if VEC_SET == 1281 @@ -3997,7 +4144,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+4]); #endif #if VEC_SET == 1281 @@ -4030,7 +4177,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq*4)+4*offset],q5); _STORE(&q[(ldq*4)+5*offset],q6); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[(ldh)+1]); #endif #if VEC_SET == 1281 @@ -4068,7 +4215,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(y6, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+2]); #endif #if VEC_SET == 1281 @@ -4094,7 +4241,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = 
_SIMD_SET1(hh[(ldh*3)+3]); #endif #if VEC_SET == 1281 @@ -4120,7 +4267,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+4]); #endif #if VEC_SET == 1281 @@ -4146,7 +4293,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+5]); #endif #if VEC_SET == 1281 @@ -4183,7 +4330,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h for (i = BLOCK; i < nb; i++) { -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); #endif @@ -4261,7 +4408,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(y6,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); #endif @@ -4289,7 +4436,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(z6,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); #endif @@ -4320,7 +4467,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif 
/* BLOCK4 || BLOCK6 */ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); #endif #if VEC_SET == 1281 @@ -4347,7 +4494,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+i]); #endif #if VEC_SET == 1281 @@ -4383,7 +4530,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(i*ldq)+5*offset],q6); } -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); #endif #if VEC_SET == 1281 @@ -4437,7 +4584,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(x6, h1)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); #endif @@ -4464,7 +4611,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); #endif @@ -4495,7 +4642,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6 */ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); #endif #if VEC_SET == 1281 @@ 
-4521,7 +4668,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); #endif #if VEC_SET == 1281 @@ -4558,7 +4705,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); #endif @@ -4593,7 +4740,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(x6, h1)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); #endif @@ -4622,7 +4769,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); #endif #if VEC_SET == 1281 @@ -4648,7 +4795,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]); #endif #if VEC_SET == 1281 @@ -4682,7 +4829,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[((nb+1)*ldq)+4*offset],q5); _STORE(&q[((nb+1)*ldq)+5*offset],q6); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 
512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); #endif @@ -4718,7 +4865,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-2]); #endif #if VEC_SET == 1281 @@ -4745,7 +4892,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(y6, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); #endif #if VEC_SET == 1281 @@ -4783,7 +4930,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6 */ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-2]); #endif #if VEC_SET == 1281 @@ -4816,7 +4963,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q6 = _SIMD_SUB(q6, _SIMD_MUL(x6, h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-1]); #endif #if VEC_SET == 1281 @@ -4849,7 +4996,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[((nb+3)*ldq)+4*offset],q5); _STORE(&q[((nb+3)*ldq)+5*offset],q6); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-1]); #endif #if VEC_SET == 1281 @@ -4999,7 +5146,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h 
__SIMD_DATATYPE x4 = _LOAD(&q[ldq+3*offset]); __SIMD_DATATYPE x5 = _LOAD(&q[ldq+4*offset]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h1 = _SIMD_SET1(hh[ldh+1]); #endif #if VEC_SET == 1281 @@ -5041,7 +5188,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq]); __SIMD_DATATYPE a4_1 = _LOAD(&q[0]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); @@ -5199,7 +5346,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE a5_1 = _LOAD(&q[ldq]); __SIMD_DATATYPE a6_1 = _LOAD(&q[0]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_6_5 = _SIMD_SET1(hh[(ldh*5)+1]); __SIMD_DATATYPE h_6_4 = _SIMD_SET1(hh[(ldh*5)+2]); __SIMD_DATATYPE h_6_3 = _SIMD_SET1(hh[(ldh*5)+3]); @@ -5237,7 +5384,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h t1 = _SIMD_ADD(t1, _SIMD_MUL(a1_1, h_6_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_5_4 = _SIMD_SET1(hh[(ldh*4)+1]); __SIMD_DATATYPE h_5_3 = _SIMD_SET1(hh[(ldh*4)+2]); __SIMD_DATATYPE h_5_2 = _SIMD_SET1(hh[(ldh*4)+3]); @@ -5271,7 +5418,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v1 = _SIMD_ADD(v1, _SIMD_MUL(a1_1, h_5_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 
512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); @@ -5299,7 +5446,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); @@ -5520,7 +5667,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h for(i = BLOCK; i < nb; i++) { -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); #endif @@ -5563,7 +5710,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); #endif @@ -5589,7 +5736,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z5 = _SIMD_ADD(z5, _SIMD_MUL(q5,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); #endif @@ -5619,7 +5766,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 
h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); #endif #if VEC_SET == 1281 @@ -5643,7 +5790,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v5 = _SIMD_ADD(v5, _SIMD_MUL(q5,h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+i]); #endif @@ -5672,7 +5819,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK6 */ } -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); #endif #if VEC_SET == 1281 @@ -5703,7 +5850,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); #endif @@ -5729,7 +5876,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); #endif @@ -5756,7 +5903,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif #ifdef BLOCK4 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-2]); #endif @@ -5788,7 +5935,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || 
VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[(ldh*1)+nb-1]); #endif @@ -5814,7 +5961,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-1]); #endif @@ -5851,7 +5998,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); #endif @@ -5877,7 +6024,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w5 = _SIMD_ADD(w5, _SIMD_MUL(q5,h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); #endif @@ -5903,7 +6050,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v5 = _SIMD_ADD(v5, _SIMD_MUL(q5,h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-4]); #endif #if VEC_SET == 1281 @@ -5933,7 +6080,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-3]); #endif #if VEC_SET == 1281 @@ -5957,7 +6104,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); 
#endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); #endif #if VEC_SET == 1281 @@ -5981,7 +6128,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z5 = _SIMD_ADD(z5, _SIMD_MUL(q5,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]); #endif #if VEC_SET == 1281 @@ -6005,7 +6152,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w5 = _SIMD_ADD(w5, _SIMD_MUL(q5,h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-3]); #endif #if VEC_SET == 1281 @@ -6035,7 +6182,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-2]); #endif @@ -6061,7 +6208,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); #endif @@ -6087,7 +6234,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z5 = _SIMD_ADD(z5, _SIMD_MUL(q5,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == 
NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-2]); #endif @@ -6119,7 +6266,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-1]); #endif @@ -6145,7 +6292,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-1]); #endif @@ -6195,7 +6342,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ///////////////////////////////////////////////////// #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE tau1 = _SIMD_SET1(hh[0]); __SIMD_DATATYPE tau2 = _SIMD_SET1(hh[ldh]); @@ -6330,8 +6477,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h h1 = _XOR(tau1, sign); #endif -#if VEC_SET == 1281 - h1 = _fjsp_neg_v2r8(tau1); +#if VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau1); #endif #if VEC_SET == 512 @@ -6366,8 +6513,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h h1 = _XOR(tau2, sign); #endif -#if VEC_SET == 1281 - h1 = _fjsp_neg_v2r8(tau2); +#if VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau2); #endif #if VEC_SET == 512 @@ -6566,7 +6713,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[4*offset],q5); #ifdef BLOCK2 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 
h2 = _SIMD_SET1(hh[ldh+1]); #endif #if VEC_SET == 1281 @@ -6602,7 +6749,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK2 */ #ifdef BLOCK4 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+1]); #endif @@ -6639,7 +6786,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[ldq+3*offset],q4); _STORE(&q[ldq+4*offset],q5); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+2]); #endif @@ -6675,7 +6822,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+1]); #endif @@ -6707,7 +6854,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq*2)+3*offset],q4); _STORE(&q[(ldq*2)+4*offset],q5); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+3]); #endif @@ -6745,7 +6892,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+1]); #endif @@ -6771,7 +6918,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 
128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+2]); #endif @@ -6807,7 +6954,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+1]); #endif #if VEC_SET == 1281 @@ -6848,7 +6995,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq+3*offset)],q4); _STORE(&q[(ldq+4*offset)],q5); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+1]); #endif #if VEC_SET == 1281 @@ -6883,7 +7030,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+2]); #endif #if VEC_SET == 1281 @@ -6913,7 +7060,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq*2)+3*offset],q4); _STORE(&q[(ldq*2)+4*offset],q5); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+1]); #endif @@ -6950,7 +7097,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+2]); #endif #if VEC_SET == 1281 @@ -6974,7 +7121,7 @@ __forceinline void 
CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+3]); #endif #if VEC_SET == 1281 @@ -7004,7 +7151,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq*3)+3*offset],q4); _STORE(&q[(ldq*3)+4*offset],q5); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+1]); #endif #if VEC_SET == 1281 @@ -7039,7 +7186,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+2]); #endif #if VEC_SET == 1281 @@ -7063,7 +7210,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+3]); #endif #if VEC_SET == 1281 @@ -7087,7 +7234,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+4]); #endif #if VEC_SET == 1281 @@ -7117,7 +7264,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq*4)+3*offset],q4); _STORE(&q[(ldq*4)+4*offset],q5); -#if VEC_SET == 128 || 
VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[(ldh)+1]); #endif #if VEC_SET == 1281 @@ -7151,7 +7298,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+2]); #endif #if VEC_SET == 1281 @@ -7175,7 +7322,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+3]); #endif #if VEC_SET == 1281 @@ -7199,7 +7346,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+4]); #endif #if VEC_SET == 1281 @@ -7223,7 +7370,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+5]); #endif #if VEC_SET == 1281 @@ -7257,7 +7404,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h for (i = BLOCK; i < nb; i++) { -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); h2 = 
_SIMD_SET1(hh[ldh+i-(BLOCK-2)]); #endif @@ -7327,7 +7474,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(y5,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); #endif @@ -7353,7 +7500,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(z5,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); #endif @@ -7382,7 +7529,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6 */ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); #endif #if VEC_SET == 1281 @@ -7407,7 +7554,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+i]); #endif #if VEC_SET == 1281 @@ -7441,7 +7588,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(i*ldq)+4*offset],q5); } -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); #endif #if VEC_SET == 1281 @@ -7490,7 +7637,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(x5, h1)); #endif -#if 
VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); #endif @@ -7516,7 +7663,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); #endif @@ -7545,7 +7692,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6 */ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); #endif #if VEC_SET == 1281 @@ -7569,7 +7716,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); #endif #if VEC_SET == 1281 @@ -7603,7 +7750,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); #endif @@ -7635,7 +7782,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(x5, h1)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); #endif @@ -7662,7 +7809,7 @@ __forceinline void 
CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); #endif #if VEC_SET == 1281 @@ -7686,7 +7833,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]); #endif #if VEC_SET == 1281 @@ -7717,7 +7864,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[((nb+1)*ldq)+3*offset],q4); _STORE(&q[((nb+1)*ldq)+4*offset],q5); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); #endif @@ -7750,7 +7897,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-2]); #endif #if VEC_SET == 1281 @@ -7774,7 +7921,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); #endif #if VEC_SET == 1281 @@ -7809,7 +7956,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6 */ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == 
NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-2]); #endif #if VEC_SET == 1281 @@ -7839,7 +7986,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q5 = _SIMD_SUB(q5, _SIMD_MUL(x5, h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-1]); #endif #if VEC_SET == 1281 @@ -7869,7 +8016,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[((nb+3)*ldq)+3*offset],q4); _STORE(&q[((nb+3)*ldq)+4*offset],q5); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-1]); #endif #if VEC_SET == 1281 @@ -8015,7 +8162,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE x3 = _LOAD(&q[ldq+2*offset]); __SIMD_DATATYPE x4 = _LOAD(&q[ldq+3*offset]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h1 = _SIMD_SET1(hh[ldh+1]); #endif #if VEC_SET == 1281 @@ -8053,7 +8200,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq]); __SIMD_DATATYPE a4_1 = _LOAD(&q[0]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); @@ -8187,7 +8334,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE a5_1 = _LOAD(&q[ldq]); __SIMD_DATATYPE a6_1 = _LOAD(&q[0]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || 
VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_6_5 = _SIMD_SET1(hh[(ldh*5)+1]); __SIMD_DATATYPE h_6_4 = _SIMD_SET1(hh[(ldh*5)+2]); __SIMD_DATATYPE h_6_3 = _SIMD_SET1(hh[(ldh*5)+3]); @@ -8225,7 +8372,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h t1 = _SIMD_ADD(t1, _SIMD_MUL(a1_1, h_6_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_5_4 = _SIMD_SET1(hh[(ldh*4)+1]); __SIMD_DATATYPE h_5_3 = _SIMD_SET1(hh[(ldh*4)+2]); __SIMD_DATATYPE h_5_2 = _SIMD_SET1(hh[(ldh*4)+3]); @@ -8258,7 +8405,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v1 = _SIMD_ADD(v1, _SIMD_MUL(a1_1, h_5_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); @@ -8286,7 +8433,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); @@ -8462,7 +8609,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h for(i = BLOCK; i < nb; i++) { -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); #endif @@ -8501,7 +8648,7 @@ __forceinline void 
CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* __ELPA_USE_FMA__ */ #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); #endif @@ -8525,7 +8672,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); #endif @@ -8552,7 +8699,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6 */ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+i-(BLOCK-5)]); #endif #if VEC_SET == 1281 @@ -8574,7 +8721,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v4 = _SIMD_ADD(v4, _SIMD_MUL(q4,h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+i]); #endif @@ -8600,7 +8747,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK6 */ } -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); #endif #if VEC_SET == 1281 @@ -8629,7 +8776,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET 
== NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); #endif @@ -8653,7 +8800,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); #endif @@ -8678,7 +8825,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif #ifdef BLOCK4 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-2]); #endif @@ -8707,7 +8854,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[(ldh*1)+nb-1]); #endif @@ -8731,7 +8878,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-1]); #endif @@ -8765,7 +8912,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-(BLOCK-4)]); #endif @@ -8789,7 +8936,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w4 = _SIMD_ADD(w4, _SIMD_MUL(q4,h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || 
VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+nb-(BLOCK-5)]); #endif @@ -8813,7 +8960,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v4 = _SIMD_ADD(v4, _SIMD_MUL(q4,h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); #endif #if VEC_SET == 1281 @@ -8840,7 +8987,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); #endif #if VEC_SET == 1281 @@ -8862,7 +9009,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-4)]); #endif #if VEC_SET == 1281 @@ -8884,7 +9031,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-(BLOCK-5)]); #endif #if VEC_SET == 1281 @@ -8906,7 +9053,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w4 = _SIMD_ADD(w4, _SIMD_MUL(q4,h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); #endif 
#if VEC_SET == 1281 @@ -8933,7 +9080,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-4)]); #endif @@ -8957,7 +9104,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-5)]); #endif @@ -8981,7 +9128,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-4)]); #endif @@ -9010,7 +9157,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-5)]); #endif @@ -9034,7 +9181,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-5)]); #endif @@ -9081,7 +9228,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ///////////////////////////////////////////////////// #endif -#if VEC_SET 
== 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE tau1 = _SIMD_SET1(hh[0]); __SIMD_DATATYPE tau2 = _SIMD_SET1(hh[ldh]); @@ -9216,8 +9363,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h h1 = _XOR(tau1, sign); #endif -#if VEC_SET == 1281 - h1 = _fjsp_neg_v2r8(tau1); +#if VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau1); #endif #if VEC_SET == 512 @@ -9251,8 +9398,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h h1 = _XOR(tau2, sign); #endif -#if VEC_SET == 1281 - h1 = _fjsp_neg_v2r8(tau2); +#if VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau2); #endif #if VEC_SET == 512 @@ -9413,7 +9560,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[3*offset],q4); #ifdef BLOCK2 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+1]); #endif #if VEC_SET == 1281 @@ -9447,7 +9594,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK2 */ #ifdef BLOCK4 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+1]); #endif @@ -9480,7 +9627,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[ldq+2*offset],q3); _STORE(&q[ldq+3*offset],q4); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+2]); #endif @@ -9514,7 +9661,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 
|| VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+1]); #endif @@ -9543,7 +9690,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq*2)+2*offset],q3); _STORE(&q[(ldq*2)+3*offset],q4); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+3]); #endif @@ -9577,7 +9724,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+1]); #endif @@ -9601,7 +9748,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+2]); #endif @@ -9634,7 +9781,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+1]); #endif #if VEC_SET == 1281 @@ -9671,7 +9818,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq+2*offset)],q3); _STORE(&q[(ldq+3*offset)],q4); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+1]); #endif #if VEC_SET == 1281 @@ -9702,7 +9849,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, 
_SIMD_MUL(v4, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+2]); #endif #if VEC_SET == 1281 @@ -9729,7 +9876,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq*2)+2*offset],q3); _STORE(&q[(ldq*2)+3*offset],q4); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+1]); #endif @@ -9763,7 +9910,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+2]); #endif #if VEC_SET == 1281 @@ -9785,7 +9932,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+3]); #endif #if VEC_SET == 1281 @@ -9812,7 +9959,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq*3)+2*offset],q3); _STORE(&q[(ldq*3)+3*offset],q4); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+1]); #endif #if VEC_SET == 1281 @@ -9844,7 +9991,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || 
VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+2]); #endif #if VEC_SET == 1281 @@ -9866,7 +10013,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+3]); #endif #if VEC_SET == 1281 @@ -9888,7 +10035,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+4]); #endif #if VEC_SET == 1281 @@ -9915,7 +10062,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq*4)+2*offset],q3); _STORE(&q[(ldq*4)+3*offset],q4); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[(ldh)+1]); #endif #if VEC_SET == 1281 @@ -9946,7 +10093,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+2]); #endif #if VEC_SET == 1281 @@ -9968,7 +10115,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+3]); #endif #if VEC_SET == 1281 @@ -9990,7 +10137,7 @@ __forceinline void 
CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+4]); #endif #if VEC_SET == 1281 @@ -10012,7 +10159,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+5]); #endif #if VEC_SET == 1281 @@ -10043,7 +10190,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h for (i = BLOCK; i < nb; i++) { -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); #endif @@ -10105,7 +10252,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(y4,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); #endif @@ -10129,7 +10276,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(z4,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); #endif @@ -10156,7 +10303,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6*/ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET 
== 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); #endif #if VEC_SET == 1281 @@ -10178,7 +10325,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+i]); #endif #if VEC_SET == 1281 @@ -10208,7 +10355,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(i*ldq)+3*offset],q4); } -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); #endif #if VEC_SET == 1281 @@ -10252,7 +10399,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); #endif @@ -10276,7 +10423,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); #endif @@ -10303,7 +10450,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6 */ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); #endif #if VEC_SET == 1281 @@ -10325,7 +10472,7 @@ __forceinline void 
CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); #endif #if VEC_SET == 1281 @@ -10356,7 +10503,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); #endif @@ -10385,7 +10532,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); #endif @@ -10410,7 +10557,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); #endif #if VEC_SET == 1281 @@ -10432,7 +10579,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]); #endif #if VEC_SET == 1281 @@ -10460,7 +10607,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[((nb+1)*ldq)+2*offset],q3); _STORE(&q[((nb+1)*ldq)+3*offset],q4); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || 
VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); #endif @@ -10490,7 +10637,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-2]); #endif #if VEC_SET == 1281 @@ -10512,7 +10659,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); #endif #if VEC_SET == 1281 @@ -10544,7 +10691,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-2]); #endif #if VEC_SET == 1281 @@ -10571,7 +10718,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-1]); #endif #if VEC_SET == 1281 @@ -10598,7 +10745,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[((nb+3)*ldq)+2*offset],q3); _STORE(&q[((nb+3)*ldq)+3*offset],q4); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-1]); #endif #if VEC_SET == 1281 @@ -10738,7 +10885,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE x2 = _LOAD(&q[ldq+offset]); 
__SIMD_DATATYPE x3 = _LOAD(&q[ldq+2*offset]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h1 = _SIMD_SET1(hh[ldh+1]); #endif #if VEC_SET == 1281 @@ -10770,7 +10917,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq]); __SIMD_DATATYPE a4_1 = _LOAD(&q[0]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); @@ -10880,7 +11027,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE a5_1 = _LOAD(&q[ldq]); __SIMD_DATATYPE a6_1 = _LOAD(&q[0]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_6_5 = _SIMD_SET1(hh[(ldh*5)+1]); __SIMD_DATATYPE h_6_4 = _SIMD_SET1(hh[(ldh*5)+2]); __SIMD_DATATYPE h_6_3 = _SIMD_SET1(hh[(ldh*5)+3]); @@ -10918,7 +11065,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h t1 = _SIMD_ADD(t1, _SIMD_MUL(a1_1, h_6_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_5_4 = _SIMD_SET1(hh[(ldh*4)+1]); __SIMD_DATATYPE h_5_3 = _SIMD_SET1(hh[(ldh*4)+2]); __SIMD_DATATYPE h_5_2 = _SIMD_SET1(hh[(ldh*4)+3]); @@ -10951,7 +11098,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v1 = _SIMD_ADD(v1, _SIMD_MUL(a1_1, h_5_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == 
NEON_ARCH64_128 __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); @@ -10979,7 +11126,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); @@ -11111,7 +11258,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h for(i = BLOCK; i < nb; i++) { -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); #endif @@ -11145,7 +11292,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* __ELPA_USE_FMA__ */ #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); #endif @@ -11167,7 +11314,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); #endif @@ -11193,7 +11340,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = 
_SIMD_SET1(hh[(ldh*4)+i-1]); #endif #if VEC_SET == 1281 @@ -11213,7 +11360,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v3 = _SIMD_ADD(v3, _SIMD_MUL(q3,h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+i-(BLOCK-6)]); #endif @@ -11237,7 +11384,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK6 */ } -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); #endif #if VEC_SET == 1281 @@ -11263,7 +11410,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); #endif @@ -11285,7 +11432,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); #endif @@ -11308,7 +11455,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* __ELPA_USE_FMA__ */ #ifdef BLOCK4 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-2]); #endif @@ -11333,7 +11480,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || 
VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[(ldh*1)+nb-1]); #endif @@ -11355,7 +11502,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-1]); #endif @@ -11386,7 +11533,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-(BLOCK-4)]); #endif @@ -11408,7 +11555,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w3 = _SIMD_ADD(w3, _SIMD_MUL(q3,h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+nb-(BLOCK-5)]); #endif @@ -11430,7 +11577,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v3 = _SIMD_ADD(v3, _SIMD_MUL(q3,h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); #endif #if VEC_SET == 1281 @@ -11454,7 +11601,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); #endif #if VEC_SET == 1281 @@ -11474,7 +11621,7 @@ __forceinline void 
CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-4)]); #endif #if VEC_SET == 1281 @@ -11494,7 +11641,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-(BLOCK-5)]); #endif #if VEC_SET == 1281 @@ -11514,7 +11661,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w3 = _SIMD_ADD(w3, _SIMD_MUL(q3,h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); #endif #if VEC_SET == 1281 @@ -11538,7 +11685,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-4)]); #endif @@ -11560,7 +11707,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-5)]); #endif @@ -11582,7 +11729,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); #endif /* 
__ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-4)]); #endif @@ -11609,7 +11756,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-5)]); #endif @@ -11631,7 +11778,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-5)]); #endif @@ -11675,7 +11822,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ///////////////////////////////////////////////////// #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE tau1 = _SIMD_SET1(hh[0]); __SIMD_DATATYPE tau2 = _SIMD_SET1(hh[ldh]); #if defined(BLOCK4) || defined(BLOCK6) @@ -11808,8 +11955,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h h1 = _XOR(tau1, sign); #endif -#if VEC_SET == 1281 - h1 = _fjsp_neg_v2r8(tau1); +#if VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau1); #endif #if VEC_SET == 512 @@ -11842,8 +11989,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h h1 = _XOR(tau2, sign); #endif -#if VEC_SET == 1281 - h1 = _fjsp_neg_v2r8(tau2); +#if VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau2); #endif #if VEC_SET == 512 @@ -11998,7 +12145,7 @@ __forceinline void
CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[2*offset],q3); #ifdef BLOCK2 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+1]); #endif #if VEC_SET == 1281 @@ -12028,7 +12175,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK2 */ #ifdef BLOCK4 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+1]); #endif @@ -12057,7 +12204,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[ldq+offset],q2); _STORE(&q[ldq+2*offset],q3); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+2]); #endif @@ -12087,7 +12234,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+1]); #endif @@ -12113,7 +12260,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq*2)+offset],q2); _STORE(&q[(ldq*2)+2*offset],q3); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+3]); #endif @@ -12142,7 +12289,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = 
_SIMD_SET1(hh[ldh+1]); #endif @@ -12164,7 +12311,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+2]); #endif @@ -12194,7 +12341,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+1]); #endif #if VEC_SET == 1281 @@ -12226,7 +12373,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq+offset)],q2); _STORE(&q[(ldq+2*offset)],q3); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+1]); #endif #if VEC_SET == 1281 @@ -12254,7 +12401,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+2]); #endif #if VEC_SET == 1281 @@ -12278,7 +12425,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq*2)+offset],q2); _STORE(&q[(ldq*2)+2*offset],q3); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+1]); #endif @@ -12308,7 +12455,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 
|| VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+2]); #endif #if VEC_SET == 1281 @@ -12328,7 +12475,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+3]); #endif #if VEC_SET == 1281 @@ -12352,7 +12499,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq*3)+offset],q2); _STORE(&q[(ldq*3)+2*offset],q3); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+1]); #endif #if VEC_SET == 1281 @@ -12380,7 +12527,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+2]); #endif #if VEC_SET == 1281 @@ -12400,7 +12547,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+3]); #endif #if VEC_SET == 1281 @@ -12420,7 +12567,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == 
NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+4]); #endif #if VEC_SET == 1281 @@ -12444,7 +12591,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(ldq*4)+offset],q2); _STORE(&q[(ldq*4)+2*offset],q3); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[(ldh)+1]); #endif #if VEC_SET == 1281 @@ -12471,7 +12618,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+2]); #endif #if VEC_SET == 1281 @@ -12491,7 +12638,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+3]); #endif #if VEC_SET == 1281 @@ -12511,7 +12658,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+4]); #endif #if VEC_SET == 1281 @@ -12531,7 +12678,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+5]); #endif #if VEC_SET == 1281 @@ -12559,7 +12706,7 @@ __forceinline void 
CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h for (i = BLOCK; i < nb; i++) { -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); #endif @@ -12613,7 +12760,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(y3,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); #endif @@ -12635,7 +12782,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(z3,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); #endif @@ -12660,7 +12807,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6 */ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); #endif #if VEC_SET == 1281 @@ -12680,7 +12827,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+i]); #endif #if VEC_SET == 1281 @@ -12706,7 +12853,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(i*ldq)+2*offset],q3); } -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || 
VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); #endif #if VEC_SET == 1281 @@ -12745,7 +12892,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); #endif @@ -12767,7 +12914,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); #endif @@ -12792,7 +12939,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6 */ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); #endif #if VEC_SET == 1281 @@ -12813,7 +12960,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); #endif #if VEC_SET == 1281 @@ -12841,7 +12988,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); #endif @@ -12867,7 +13014,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = 
_SIMD_SUB(q3, _SIMD_MUL(x3, h1)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); #endif @@ -12890,7 +13037,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); #endif #if VEC_SET == 1281 @@ -12910,7 +13057,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]); #endif #if VEC_SET == 1281 @@ -12935,7 +13082,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[((nb+1)*ldq)+offset],q2); _STORE(&q[((nb+1)*ldq)+2*offset],q3); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); #endif @@ -12962,7 +13109,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-2]); #endif #if VEC_SET == 1281 @@ -12982,7 +13129,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); #endif #if 
VEC_SET == 1281 @@ -13010,7 +13157,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6 */ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-2]); #endif #if VEC_SET == 1281 @@ -13034,7 +13181,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-1]); #endif #if VEC_SET == 1281 @@ -13058,7 +13205,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[((nb+3)*ldq)+offset],q2); _STORE(&q[((nb+3)*ldq)+2*offset],q3); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-1]); #endif #if VEC_SET == 1281 @@ -13196,7 +13343,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE x1 = _LOAD(&q[ldq]); __SIMD_DATATYPE x2 = _LOAD(&q[ldq+offset]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h1 = _SIMD_SET1(hh[ldh+1]); #endif #if VEC_SET == 1281 @@ -13226,7 +13373,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq]); __SIMD_DATATYPE a4_1 = _LOAD(&q[0]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); @@ -13312,7 +13459,7 @@ 
__forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE a5_1 = _LOAD(&q[ldq]); __SIMD_DATATYPE a6_1 = _LOAD(&q[0]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_6_5 = _SIMD_SET1(hh[(ldh*5)+1]); __SIMD_DATATYPE h_6_4 = _SIMD_SET1(hh[(ldh*5)+2]); __SIMD_DATATYPE h_6_3 = _SIMD_SET1(hh[(ldh*5)+3]); @@ -13350,7 +13497,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h t1 = _SIMD_ADD(t1, _SIMD_MUL(a1_1, h_6_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_5_4 = _SIMD_SET1(hh[(ldh*4)+1]); __SIMD_DATATYPE h_5_3 = _SIMD_SET1(hh[(ldh*4)+2]); __SIMD_DATATYPE h_5_2 = _SIMD_SET1(hh[(ldh*4)+3]); @@ -13383,7 +13530,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v1 = _SIMD_ADD(v1, _SIMD_MUL(a1_1, h_5_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); @@ -13411,7 +13558,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); @@ -13498,7 +13645,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h for(i = BLOCK; i 
< nb; i++) { -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); #endif @@ -13526,7 +13673,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* __ELPA_USE_FMA__ */ #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); #endif @@ -13546,7 +13693,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); #endif @@ -13570,7 +13717,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); #endif #if VEC_SET == 1281 @@ -13588,7 +13735,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v2 = _SIMD_ADD(v2, _SIMD_MUL(q2,h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+i-(BLOCK-6)]); #endif @@ -13610,7 +13757,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK6 */ } -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); #endif #if VEC_SET == 1281 @@ 
-13632,7 +13779,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); #endif @@ -13652,7 +13799,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); #endif @@ -13674,7 +13821,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK4 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-2]); #endif @@ -13697,7 +13844,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[(ldh*1)+nb-1]); #endif @@ -13717,7 +13864,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-1]); #endif @@ -13745,7 +13892,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = 
_SIMD_SET1(hh[(ldh*3)+nb-(BLOCK-4)]); #endif @@ -13765,7 +13912,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); #endif @@ -13785,7 +13932,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v2 = _SIMD_ADD(v2, _SIMD_MUL(q2,h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); #endif #if VEC_SET == 1281 @@ -13806,7 +13953,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); #endif #if VEC_SET == 1281 @@ -13824,7 +13971,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-4)]); #endif #if VEC_SET == 1281 @@ -13842,7 +13989,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-(BLOCK-5)]); #endif #if VEC_SET == 1281 @@ -13860,7 +14007,7 @@ __forceinline void 
CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); #endif #if VEC_SET == 1281 @@ -13881,7 +14028,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-4)]); #endif @@ -13901,7 +14048,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-5)]); #endif @@ -13921,7 +14068,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-2]); #endif @@ -13944,7 +14091,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-1]); #endif @@ -13964,7 +14111,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 
1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-1]); #endif @@ -14004,7 +14151,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ///////////////////////////////////////////////////// #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE tau1 = _SIMD_SET1(hh[0]); __SIMD_DATATYPE tau2 = _SIMD_SET1(hh[ldh]); #if defined(BLOCK4) || defined(BLOCK6) @@ -14137,8 +14284,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h h1 = _XOR(tau1, sign); #endif -#if VEC_SET == 1281 - h1 = _fjsp_neg_v2r8(tau1); +#if VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau1); #endif #if VEC_SET == 512 @@ -14170,8 +14317,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h h1 = _XOR(tau2, sign); #endif -#if VEC_SET == 1281 - h1 = _fjsp_neg_v2r8(tau2); +#if VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau2); #endif #if VEC_SET == 512 @@ -14301,7 +14448,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[offset],q2); #ifdef BLOCK2 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+1]); #endif #if VEC_SET == 1281 @@ -14329,7 +14476,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK2 */ #ifdef BLOCK4 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+1]); #endif @@ -14355,7 +14502,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[ldq],q1); _STORE(&q[ldq+offset],q2); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 
1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+2]); #endif @@ -14380,7 +14527,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+1]); #endif @@ -14403,7 +14550,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[ldq*2],q1); _STORE(&q[(ldq*2)+offset],q2); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+3]); #endif @@ -14428,7 +14575,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+1]); #endif @@ -14448,7 +14595,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+2]); #endif @@ -14475,7 +14622,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+1]); #endif #if VEC_SET == 1281 @@ -14501,7 +14648,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[ldq],q1); 
_STORE(&q[(ldq+offset)],q2); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+1]); #endif #if VEC_SET == 1281 @@ -14523,7 +14670,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+2]); #endif #if VEC_SET == 1281 @@ -14544,7 +14691,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[ldq*2],q1); _STORE(&q[(ldq*2)+offset],q2); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+1]); #endif @@ -14570,7 +14717,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+2]); #endif #if VEC_SET == 1281 @@ -14588,7 +14735,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+3]); #endif #if VEC_SET == 1281 @@ -14609,7 +14756,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[ldq*3],q1); _STORE(&q[(ldq*3)+offset],q2); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == 
NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+1]); #endif #if VEC_SET == 1281 @@ -14632,7 +14779,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+2]); #endif #if VEC_SET == 1281 @@ -14650,7 +14797,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+3]); #endif #if VEC_SET == 1281 @@ -14668,7 +14815,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+4]); #endif #if VEC_SET == 1281 @@ -14689,7 +14836,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[ldq*4],q1); _STORE(&q[(ldq*4)+offset],q2); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[(ldh)+1]); #endif #if VEC_SET == 1281 @@ -14711,7 +14858,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+2]); #endif #if VEC_SET == 1281 @@ -14729,7 +14876,7 @@ __forceinline void 
CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+3]); #endif #if VEC_SET == 1281 @@ -14747,7 +14894,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+4]); #endif #if VEC_SET == 1281 @@ -14765,7 +14912,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+5]); #endif #if VEC_SET == 1281 @@ -14789,7 +14936,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h for (i = BLOCK; i < nb; i++) { -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); #endif @@ -14835,7 +14982,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(y2,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); #endif @@ -14855,7 +15002,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(z2,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET 
== 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); #endif @@ -14878,7 +15025,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6*/ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); #endif #if VEC_SET == 1281 @@ -14896,7 +15043,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+i]); #endif #if VEC_SET == 1281 @@ -14919,7 +15066,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(i*ldq)+offset],q2); } -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); #endif #if VEC_SET == 1281 @@ -14953,7 +15100,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); #endif @@ -14973,7 +15120,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); #endif @@ -14996,7 +15143,7 @@ __forceinline void 
CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6 */ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); #endif #if VEC_SET == 1281 @@ -15014,7 +15161,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); #endif #if VEC_SET == 1281 @@ -15039,7 +15186,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); #endif @@ -15062,7 +15209,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); #endif @@ -15083,7 +15230,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); #endif #if VEC_SET == 1281 @@ -15101,7 +15248,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = 
_SIMD_SET1(hh[(ldh*3)+nb-1]); #endif #if VEC_SET == 1281 @@ -15123,7 +15270,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(nb+1)*ldq],q1); _STORE(&q[((nb+1)*ldq)+offset],q2); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); #endif @@ -15147,7 +15294,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-2]); #endif #if VEC_SET == 1281 @@ -15165,7 +15312,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); #endif #if VEC_SET == 1281 @@ -15191,7 +15338,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6*/ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-2]); #endif #if VEC_SET == 1281 @@ -15212,7 +15359,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-1]); #endif #if VEC_SET == 1281 @@ -15233,7 +15380,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[(nb+3)*ldq],q1); 
_STORE(&q[((nb+3)*ldq)+offset],q2); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-1]); #endif #if VEC_SET == 1281 @@ -15367,7 +15514,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* VEC_SET == 512 */ __SIMD_DATATYPE x1 = _LOAD(&q[ldq]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h1 = _SIMD_SET1(hh[ldh+1]); #endif #if VEC_SET == 1281 @@ -15393,7 +15540,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq]); __SIMD_DATATYPE a4_1 = _LOAD(&q[0]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); @@ -15455,7 +15602,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h __SIMD_DATATYPE a5_1 = _LOAD(&q[ldq]); __SIMD_DATATYPE a6_1 = _LOAD(&q[0]); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_6_5 = _SIMD_SET1(hh[(ldh*5)+1]); __SIMD_DATATYPE h_6_4 = _SIMD_SET1(hh[(ldh*5)+2]); __SIMD_DATATYPE h_6_3 = _SIMD_SET1(hh[(ldh*5)+3]); @@ -15493,7 +15640,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h t1 = _SIMD_ADD(t1, _SIMD_MUL(a1_1, h_6_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_5_4 = _SIMD_SET1(hh[(ldh*4)+1]); __SIMD_DATATYPE h_5_3 = _SIMD_SET1(hh[(ldh*4)+2]); 
__SIMD_DATATYPE h_5_2 = _SIMD_SET1(hh[(ldh*4)+3]); @@ -15526,7 +15673,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v1 = _SIMD_ADD(v1, _SIMD_MUL(a1_1, h_5_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); @@ -15554,7 +15701,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); @@ -15597,7 +15744,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h for(i = BLOCK; i < nb; i++) { -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); #endif @@ -15621,7 +15768,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* __ELPA_USE_FMA__ */ #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); #endif @@ -15639,7 +15786,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || 
VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); #endif @@ -15660,7 +15807,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+i-(BLOCK-5)]); #endif #if VEC_SET == 1281 @@ -15676,7 +15823,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v1 = _SIMD_ADD(v1, _SIMD_MUL(q1,h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+i]); #endif @@ -15696,7 +15843,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK6 */ } -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); #endif #if VEC_SET == 1281 @@ -15716,7 +15863,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); #endif @@ -15734,7 +15881,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); #endif @@ -15754,7 +15901,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK4 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || 
VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-2]); #endif @@ -15774,7 +15921,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[(ldh*1)+nb-1]); #endif @@ -15792,7 +15939,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-1]); #endif @@ -15816,7 +15963,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); #endif @@ -15834,7 +15981,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); #endif @@ -15852,7 +15999,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h v1 = _SIMD_ADD(v1, _SIMD_MUL(q1,h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-4]); #endif #if VEC_SET == 1281 @@ -15870,7 +16017,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || 
VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-3]); #endif #if VEC_SET == 1281 @@ -15886,7 +16033,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); #endif #if VEC_SET == 1281 @@ -15902,7 +16049,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]); #endif #if VEC_SET == 1281 @@ -15918,7 +16065,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-3]); #endif #if VEC_SET == 1281 @@ -15936,7 +16083,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-2]); #endif @@ -15954,7 +16101,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = 
_SIMD_SET1(hh[(ldh*2)+nb-1]); #endif @@ -15972,7 +16119,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-2]); #endif @@ -15992,7 +16139,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-1]); #endif @@ -16010,7 +16157,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-1]); #endif @@ -16049,7 +16196,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 __SIMD_DATATYPE tau1 = _SIMD_SET1(hh[0]); __SIMD_DATATYPE tau2 = _SIMD_SET1(hh[ldh]); #if defined(BLOCK4) || defined(BLOCK6) @@ -16182,8 +16329,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h h1 = _XOR(tau1, sign); #endif -#if VEC_SET == 1281 - h1 = _fjsp_neg_v2r8(tau1); +#if VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau1); #endif #if VEC_SET == 512 @@ -16213,8 +16360,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h h1 = _XOR(tau2, sign); #endif -#if VEC_SET == 1281 - h1 = _fjsp_neg_v2r8(tau2); +#if VEC_SET == 1281 || VEC_SET == NEON_ARCH64_128 + 
h1 = _SIMD_NEG(tau2); #endif #if VEC_SET == 512 @@ -16323,7 +16470,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[0],q1); #ifdef BLOCK2 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+1]); #endif #if VEC_SET == 1281 @@ -16345,7 +16492,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK2 */ #ifdef BLOCK4 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+1]); #endif @@ -16367,7 +16514,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[ldq],q1); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+2]); #endif @@ -16388,7 +16535,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+1]); #endif @@ -16408,7 +16555,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[ldq*2],q1); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+3]); #endif @@ -16429,7 +16576,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 
h2 = _SIMD_SET1(hh[ldh+1]); #endif @@ -16447,7 +16594,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+2]); #endif @@ -16471,7 +16618,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+1]); #endif #if VEC_SET == 1281 @@ -16492,7 +16639,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[ldq],q1); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+1]); #endif #if VEC_SET == 1281 @@ -16510,7 +16657,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+2]); #endif #if VEC_SET == 1281 @@ -16528,7 +16675,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[ldq*2],q1); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+1]); #endif @@ -16548,7 +16695,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 
|| VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+2]); #endif #if VEC_SET == 1281 @@ -16564,7 +16711,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+3]); #endif #if VEC_SET == 1281 @@ -16582,7 +16729,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[ldq*3],q1); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+1]); #endif #if VEC_SET == 1281 @@ -16601,7 +16748,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+2]); #endif #if VEC_SET == 1281 @@ -16617,7 +16764,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+3]); #endif #if VEC_SET == 1281 @@ -16633,7 +16780,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+4]); #endif #if VEC_SET == 1281 @@ -16651,7 +16798,7 @@ __forceinline void 
CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[ldq*4],q1); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[(ldh)+1]); #endif #if VEC_SET == 1281 @@ -16669,7 +16816,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+2]); #endif #if VEC_SET == 1281 @@ -16685,7 +16832,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+3]); #endif #if VEC_SET == 1281 @@ -16701,7 +16848,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+4]); #endif #if VEC_SET == 1281 @@ -16717,7 +16864,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+5]); #endif #if VEC_SET == 1281 @@ -16739,7 +16886,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h for (i = BLOCK; i < nb; i++) { -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET 
== 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); #endif @@ -16777,7 +16924,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(y1,h2)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); #endif @@ -16795,7 +16942,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(z1,h3)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); #endif @@ -16816,7 +16963,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6*/ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+i-(BLOCK-5)]); #endif #if VEC_SET == 1281 @@ -16833,7 +16980,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h6 = _SIMD_SET1(hh[(ldh*5)+i-(BLOCK-6)]); #endif #if VEC_SET == 1281 @@ -16854,7 +17001,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h _STORE(&q[i*ldq],q1); } -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); #endif #if VEC_SET == 1281 @@ -16883,7 +17030,7 @@ __forceinline void 
CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); #endif @@ -16901,7 +17048,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); #endif @@ -16922,7 +17069,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6 */ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-(BLOCK-4)]); #endif #if VEC_SET == 1281 @@ -16938,7 +17085,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h5 = _SIMD_SET1(hh[(ldh*4)+nb-(BLOCK-5)]); #endif #if VEC_SET == 1281 @@ -16960,7 +17107,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #if defined(BLOCK4) || defined(BLOCK6) -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); #endif @@ -16980,7 +17127,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 
= _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); #endif @@ -16999,7 +17146,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-4)]); #endif #if VEC_SET == 1281 @@ -17015,7 +17162,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); #endif -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h4 = _SIMD_SET1(hh[(ldh*3)+nb-(BLOCK-5)]); #endif #if VEC_SET == 1281 @@ -17034,7 +17181,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK6 */ _STORE(&q[(nb+1)*ldq],q1); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); #endif @@ -17055,7 +17202,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-2]); #endif #if VEC_SET == 1281 @@ -17072,7 +17219,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); #endif #if VEC_SET == 1281 @@ -17095,7 +17242,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* BLOCK4 || BLOCK6*/ #ifdef BLOCK6 -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 
+#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-2]); #endif #if VEC_SET == 1281 @@ -17113,7 +17260,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); #endif /* __ELPA_USE_FMA__ */ -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h2 = _SIMD_SET1(hh[ldh+nb-1]); #endif #if VEC_SET == 1281 @@ -17130,7 +17277,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h #endif /* __ELPA_USE_FMA__ */ _STORE(&q[(nb+3)*ldq],q1); -#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 +#if VEC_SET == 128 || VEC_SET == 512 || VEC_SET == 1282 || VEC_SET == NEON_ARCH64_128 h1 = _SIMD_SET1(hh[nb-1]); #endif #if VEC_SET == 1281 diff --git a/src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c b/src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c new file mode 100644 index 00000000..237f5568 --- /dev/null +++ b/src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen, +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Planck-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt.
Komplexe Strukturen in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see http://www.gnu.org/licenses/ +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK2 1 +#define VEC_SET NEON_ARCH64_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK2 +#undef VEC_SET +#undef REALCASE +#undef DOUBLE_PRECISION + diff --git a/src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c b/src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c new file mode 100644 index 00000000..30dd8517 --- /dev/null +++ b/src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c @@ -0,0 +1,59 @@ +// This file is part of ELPA.
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Planck-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see <http://www.gnu.org/licenses/> +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License.
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK2 1 +#define VEC_SET NEON_ARCH64_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK2 +#undef VEC_SET +#undef REALCASE +#undef SINGLE_PRECISION + diff --git a/src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c b/src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c new file mode 100644 index 00000000..46758e8b --- /dev/null +++ b/src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Planck-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see <http://www.gnu.org/licenses/> +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK4 1 +#define VEC_SET NEON_ARCH64_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK4 +#undef VEC_SET +#undef REALCASE +#undef DOUBLE_PRECISION + diff --git a/src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c b/src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c new file mode 100644 index 00000000..e8f0bdb6 --- /dev/null +++ b/src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Planck-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt.
Komplexe Strukturen in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see <http://www.gnu.org/licenses/> +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK4 1 +#define VEC_SET NEON_ARCH64_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK4 +#undef VEC_SET +#undef REALCASE +#undef SINGLE_PRECISION + diff --git a/src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c b/src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c new file mode 100644 index 00000000..d8c9fd91 --- /dev/null +++ b/src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c @@ -0,0 +1,59 @@ +// This file is part of ELPA.
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Planck-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see <http://www.gnu.org/licenses/> +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License.
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK6 1 +#define VEC_SET NEON_ARCH64_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef REALCASE +#undef VEC_SET +#undef BLOCK6 +#undef DOUBLE_PRECISION + diff --git a/src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c b/src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c new file mode 100644 index 00000000..2d2a0b84 --- /dev/null +++ b/src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Planck-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see <http://www.gnu.org/licenses/> +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK6 1 +#define VEC_SET NEON_ARCH64_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef REALCASE +#undef VEC_SET +#undef BLOCK6 +#undef SINGLE_PRECISION + -- GitLab