From 1f5874ecc004b25a6aa167a2f9cf2e8c277c3410 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 9 Jan 2019 15:53:01 +0100
Subject: [PATCH] streamline SIMD macros: define vload/vzero/vone once per section and reuse vload

---
 libsharp/sharp_core.c       |  2 +-
 libsharp/sharp_vecsupport.h | 51 ++++++++++++++++++-------------------
 libsharp/sharp_ylmgen_c.c   |  2 +-
 libsharp/sharp_ylmgen_c.h   |  2 +-
 4 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
index 2b8c3af..5bf74a8 100644
--- a/libsharp/sharp_core.c
+++ b/libsharp/sharp_core.c
@@ -25,7 +25,7 @@
 /*! \file sharp_core.c
  *  Computational core
  *
- *  Copyright (C) 2012-2018 Max-Planck-Society
+ *  Copyright (C) 2012-2019 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index 43200f8..ee09adf 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -25,7 +25,7 @@
 /*  \file sharp_vecsupport.h
  *  Convenience functions for vector arithmetics
  *
- *  Copyright (C) 2012-2016 Max-Planck-Society
+ *  Copyright (C) 2012-2019 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
@@ -42,11 +42,14 @@ typedef double Ts;
 typedef double Tv;
 typedef int Tm;
 
+#define vload(a) (a)
+#define vzero 0.
+#define vone 1.
+
 #define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
 #define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
 #define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
 #define vneg(a) (-(a))
-#define vload(a) (a)
 #define vabs(a) fabs(a)
 #define vsqrt(a) sqrt(a)
 #define vlt(a,b) ((a)<(b))
@@ -55,14 +58,10 @@ typedef int Tm;
 #define vne(a,b) ((a)!=(b))
 #define vand_mask(a,b) ((a)&&(b))
 #define vor_mask(a,b) ((a)||(b))
-
 static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
 static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
-
 #define vanyTrue(a) (a)
 #define vallTrue(a) (a)
-#define vzero 0.
-#define vone 1.
 
 #endif
 
@@ -86,15 +85,15 @@ typedef __m128d Tm;
 static inline Tv vblend__(Tv m, Tv a, Tv b)
   { return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
 #endif
+#define vload(a) _mm_set1_pd(a)
 #define vzero _mm_setzero_pd()
-#define vone _mm_set1_pd(1.)
+#define vone vload(1.)
 
-#define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero))
-#define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero))
-#define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone))
-#define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
-#define vload(a) _mm_set1_pd(a)
-#define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
+#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
+#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
+#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
+#define vneg(a) _mm_xor_pd(vload(-0.),a)
+#define vabs(a) _mm_andnot_pd(vload(-0.),a)
 #define vsqrt(a) _mm_sqrt_pd(a)
 #define vlt(a,b) _mm_cmplt_pd(a,b)
 #define vgt(a,b) _mm_cmpgt_pd(a,b)
@@ -117,15 +116,15 @@ typedef __m256d Tv;
 typedef __m256d Tm;
 
 #define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
+#define vload(a) _mm256_set1_pd(a)
 #define vzero _mm256_setzero_pd()
-#define vone _mm256_set1_pd(1.)
+#define vone vload(1.)
 
-#define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero))
-#define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero))
-#define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
-#define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
-#define vload(a) _mm256_set1_pd(a)
-#define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
+#define vaddeq_mask(mask,a,b) a+=vblend__(mask,b,vzero)
+#define vsubeq_mask(mask,a,b) a-=vblend__(mask,b,vzero)
+#define vmuleq_mask(mask,a,b) a*=vblend__(mask,b,vone)
+#define vneg(a) _mm256_xor_pd(vload(-0.),a)
+#define vabs(a) _mm256_andnot_pd(vload(-0.),a)
 #define vsqrt(a) _mm256_sqrt_pd(a)
 #define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
 #define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ)
@@ -147,12 +146,15 @@ typedef __m256d Tm;
 typedef __m512d Tv;
 typedef __mmask8 Tm;
 
+#define vload(a) _mm512_set1_pd(a)
+#define vzero _mm512_setzero_pd()
+#define vone vload(1.)
+
 #define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
 #define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
 #define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
-#define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.))
-#define vload(a) _mm512_set1_pd(a)
-#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)_mm512_set1_pd(-0.),(__m512i)a)
+#define vneg(a) _mm512_mul_pd(a,vload(-1.))
+#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)vload(-0.),(__m512i)a)
 #define vsqrt(a) _mm512_sqrt_pd(a)
 #define vlt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_LT_OQ)
 #define vgt(a,b) _mm512_cmp_pd_mask(a,b,_CMP_GT_OQ)
@@ -165,9 +167,6 @@ typedef __mmask8 Tm;
 #define vanyTrue(a) (a!=0)
 #define vallTrue(a) (a==255)
 
-#define vzero _mm512_setzero_pd()
-#define vone _mm512_set1_pd(1.)
-
 #endif
 
 #endif
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index 5bea19e..e3c055b 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -25,7 +25,7 @@
 /*
  *  Helper code for efficient calculation of Y_lm(theta,phi=0)
  *
- *  Copyright (C) 2005-2016 Max-Planck-Society
+ *  Copyright (C) 2005-2019 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
diff --git a/libsharp/sharp_ylmgen_c.h b/libsharp/sharp_ylmgen_c.h
index 5fd7f93..b1d9cbc 100644
--- a/libsharp/sharp_ylmgen_c.h
+++ b/libsharp/sharp_ylmgen_c.h
@@ -25,7 +25,7 @@
 /*! \file sharp_ylmgen_c.h
  *  Code for efficient calculation of Y_lm(phi=0,theta)
  *
- *  Copyright (C) 2005-2016 Max-Planck-Society
+ *  Copyright (C) 2005-2019 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
-- 
GitLab