From 92c1ded07513594a085dcd668d91afabf004001a Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 31 Jul 2019 14:40:53 +0200
Subject: [PATCH 1/3] pass8: first rearrangements of statement order

---
 pocketfft_hdronly.h | 48 ++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h
index 9eeef63..172cc54 100644
--- a/pocketfft_hdronly.h
+++ b/pocketfft_hdronly.h
@@ -915,19 +915,19 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
     for (size_t k=0; k<l1; ++k)
       {
       T a0, a1, a2, a3, a4, a5, a6, a7;
-      PMC(a0,a4,CC(0,0,k),CC(0,4,k));
       PMC(a1,a5,CC(0,1,k),CC(0,5,k));
-      PMC(a2,a6,CC(0,2,k),CC(0,6,k));
       PMC(a3,a7,CC(0,3,k),CC(0,7,k));
-      ROTX90<fwd>(a6);
       ROTX90<fwd>(a7);
-      PMINPLACE(a0,a2);
       PMINPLACE(a1,a3);
-      PMINPLACE(a4,a6);
+      ROTX90<fwd>(a3);
       PMINPLACE(a5,a7);
       ROTX45<fwd>(a5);
-      ROTX90<fwd>(a3);
       ROTX135<fwd>(a7);
+      PMC(a0,a4,CC(0,0,k),CC(0,4,k));
+      PMC(a2,a6,CC(0,2,k),CC(0,6,k));
+      ROTX90<fwd>(a6);
+      PMINPLACE(a0,a2);
+      PMINPLACE(a4,a6);
       PMC(CH(0,k,0),CH(0,k,4),a0,a1);
       PMC(CH(0,k,1),CH(0,k,5),a4,a5);
       PMC(CH(0,k,2),CH(0,k,6),a2,a3);
@@ -937,19 +937,19 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
     for (size_t k=0; k<l1; ++k)
       {
       T a0, a1, a2, a3, a4, a5, a6, a7;
-      PMC(a0,a4,CC(0,0,k),CC(0,4,k));
       PMC(a1,a5,CC(0,1,k),CC(0,5,k));
-      PMC(a2,a6,CC(0,2,k),CC(0,6,k));
       PMC(a3,a7,CC(0,3,k),CC(0,7,k));
-      ROTX90<fwd>(a6);
       ROTX90<fwd>(a7);
-      PMINPLACE(a0,a2);
       PMINPLACE(a1,a3);
-      PMINPLACE(a4,a6);
+      ROTX90<fwd>(a3);
       PMINPLACE(a5,a7);
       ROTX45<fwd>(a5);
-      ROTX90<fwd>(a3);
       ROTX135<fwd>(a7);
+      PMC(a0,a4,CC(0,0,k),CC(0,4,k));
+      PMC(a2,a6,CC(0,2,k),CC(0,6,k));
+      ROTX90<fwd>(a6);
+      PMINPLACE(a0,a2);
+      PMINPLACE(a4,a6);
       PMC(CH(0,k,0),CH(0,k,4),a0,a1);
       PMC(CH(0,k,1),CH(0,k,5),a4,a5);
       PMC(CH(0,k,2),CH(0,k,6),a2,a3);
@@ -958,30 +958,30 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
       for (size_t i=1; i<ido; ++i)
         {
         T a0, a1, a2, a3, a4, a5, a6, a7;
-        PMC(a0,a4,CC(i,0,k),CC(i,4,k));
         PMC(a1,a5,CC(i,1,k),CC(i,5,k));
-        PMC(a2,a6,CC(i,2,k),CC(i,6,k));
         PMC(a3,a7,CC(i,3,k),CC(i,7,k));
-        ROTX90<fwd>(a6);
         ROTX90<fwd>(a7);
-        PMINPLACE(a0,a2);
         PMINPLACE(a1,a3);
-        PMINPLACE(a4,a6);
+        ROTX90<fwd>(a3);
         PMINPLACE(a5,a7);
         ROTX45<fwd>(a5);
-        ROTX90<fwd>(a3);
         ROTX135<fwd>(a7);
+        PMC(a0,a4,CC(i,0,k),CC(i,4,k));
+        PMC(a2,a6,CC(i,2,k),CC(i,6,k));
+        ROTX90<fwd>(a6);
+        PMINPLACE(a0,a2);
+        PMINPLACE(a4,a6);
         PMINPLACE(a0,a1);
+        CH(i,k,0) = a0;
+        CH(i,k,4) = a1.template special_mul<fwd>(WA(3,i));
         PMINPLACE(a2,a3);
+        CH(i,k,2) = a2.template special_mul<fwd>(WA(1,i));
+        CH(i,k,6) = a3.template special_mul<fwd>(WA(5,i));
         PMINPLACE(a4,a5);
-        PMINPLACE(a6,a7);
-        CH(i,k,0) = a0;
         CH(i,k,1) = a4.template special_mul<fwd>(WA(0,i));
-        CH(i,k,2) = a2.template special_mul<fwd>(WA(1,i));
-        CH(i,k,3) = a6.template special_mul<fwd>(WA(2,i));
-        CH(i,k,4) = a1.template special_mul<fwd>(WA(3,i));
         CH(i,k,5) = a5.template special_mul<fwd>(WA(4,i));
-        CH(i,k,6) = a3.template special_mul<fwd>(WA(5,i));
+        PMINPLACE(a6,a7);
+        CH(i,k,3) = a6.template special_mul<fwd>(WA(2,i));
         CH(i,k,7) = a7.template special_mul<fwd>(WA(6,i));
         }
       }
-- 
GitLab


From f926ace25bfba1ed3488f3de60b36f5a47d95fc9 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 31 Jul 2019 15:12:22 +0200
Subject: [PATCH 2/3] pass8: more minor tweaks (fold PMINPLACE into output expressions)

---
 pocketfft_hdronly.h | 43 ++++++++++++++++++-------------------------
 1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h
index 172cc54..0c90f55 100644
--- a/pocketfft_hdronly.h
+++ b/pocketfft_hdronly.h
@@ -926,15 +926,14 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
       PMC(a0,a4,CC(0,0,k),CC(0,4,k));
       PMC(a2,a6,CC(0,2,k),CC(0,6,k));
       ROTX90<fwd>(a6);
-      PMINPLACE(a0,a2);
-      PMINPLACE(a4,a6);
-      PMC(CH(0,k,0),CH(0,k,4),a0,a1);
-      PMC(CH(0,k,1),CH(0,k,5),a4,a5);
-      PMC(CH(0,k,2),CH(0,k,6),a2,a3);
-      PMC(CH(0,k,3),CH(0,k,7),a6,a7);
+      PMC(CH(0,k,0),CH(0,k,4),a0+a2,a1);
+      PMC(CH(0,k,1),CH(0,k,5),a4+a6,a5);
+      PMC(CH(0,k,2),CH(0,k,6),a0-a2,a3);
+      PMC(CH(0,k,3),CH(0,k,7),a4-a6,a7);
       }
   else
     for (size_t k=0; k<l1; ++k)
+      {
       {
       T a0, a1, a2, a3, a4, a5, a6, a7;
       PMC(a1,a5,CC(0,1,k),CC(0,5,k));
@@ -948,13 +947,11 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
       PMC(a0,a4,CC(0,0,k),CC(0,4,k));
       PMC(a2,a6,CC(0,2,k),CC(0,6,k));
       ROTX90<fwd>(a6);
-      PMINPLACE(a0,a2);
-      PMINPLACE(a4,a6);
-      PMC(CH(0,k,0),CH(0,k,4),a0,a1);
-      PMC(CH(0,k,1),CH(0,k,5),a4,a5);
-      PMC(CH(0,k,2),CH(0,k,6),a2,a3);
-      PMC(CH(0,k,3),CH(0,k,7),a6,a7);
-
+      PMC(CH(0,k,0),CH(0,k,4),a0+a2,a1);
+      PMC(CH(0,k,1),CH(0,k,5),a4+a6,a5);
+      PMC(CH(0,k,2),CH(0,k,6),a0-a2,a3);
+      PMC(CH(0,k,3),CH(0,k,7),a4-a6,a7);
+      }
       for (size_t i=1; i<ido; ++i)
         {
         T a0, a1, a2, a3, a4, a5, a6, a7;
@@ -971,18 +968,14 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
         ROTX90<fwd>(a6);
         PMINPLACE(a0,a2);
         PMINPLACE(a4,a6);
-        PMINPLACE(a0,a1);
-        CH(i,k,0) = a0;
-        CH(i,k,4) = a1.template special_mul<fwd>(WA(3,i));
-        PMINPLACE(a2,a3);
-        CH(i,k,2) = a2.template special_mul<fwd>(WA(1,i));
-        CH(i,k,6) = a3.template special_mul<fwd>(WA(5,i));
-        PMINPLACE(a4,a5);
-        CH(i,k,1) = a4.template special_mul<fwd>(WA(0,i));
-        CH(i,k,5) = a5.template special_mul<fwd>(WA(4,i));
-        PMINPLACE(a6,a7);
-        CH(i,k,3) = a6.template special_mul<fwd>(WA(2,i));
-        CH(i,k,7) = a7.template special_mul<fwd>(WA(6,i));
+        CH(i,k,0) = a0+a1;
+        CH(i,k,4) = (a0-a1).template special_mul<fwd>(WA(3,i));
+        CH(i,k,1) = (a4+a5).template special_mul<fwd>(WA(0,i));
+        CH(i,k,5) = (a4-a5).template special_mul<fwd>(WA(4,i));
+        CH(i,k,2) = (a2+a3).template special_mul<fwd>(WA(1,i));
+        CH(i,k,6) = (a2-a3).template special_mul<fwd>(WA(5,i));
+        CH(i,k,3) = (a6+a7).template special_mul<fwd>(WA(2,i));
+        CH(i,k,7) = (a6-a7).template special_mul<fwd>(WA(6,i));
         }
       }
    }
-- 
GitLab


From 140e68f5787366d058fa50f2a3a0d16815177dd5 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 1 Aug 2019 12:26:30 +0200
Subject: [PATCH 3/3] pass8: more tweaks (move ROTX90 calls next to their uses)

---
 pocketfft_hdronly.h | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h
index 0c90f55..403d8c9 100644
--- a/pocketfft_hdronly.h
+++ b/pocketfft_hdronly.h
@@ -917,18 +917,20 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
       T a0, a1, a2, a3, a4, a5, a6, a7;
       PMC(a1,a5,CC(0,1,k),CC(0,5,k));
       PMC(a3,a7,CC(0,3,k),CC(0,7,k));
-      ROTX90<fwd>(a7);
       PMINPLACE(a1,a3);
       ROTX90<fwd>(a3);
+
+      ROTX90<fwd>(a7);
       PMINPLACE(a5,a7);
       ROTX45<fwd>(a5);
       ROTX135<fwd>(a7);
+
       PMC(a0,a4,CC(0,0,k),CC(0,4,k));
       PMC(a2,a6,CC(0,2,k),CC(0,6,k));
-      ROTX90<fwd>(a6);
       PMC(CH(0,k,0),CH(0,k,4),a0+a2,a1);
-      PMC(CH(0,k,1),CH(0,k,5),a4+a6,a5);
       PMC(CH(0,k,2),CH(0,k,6),a0-a2,a3);
+      ROTX90<fwd>(a6);
+      PMC(CH(0,k,1),CH(0,k,5),a4+a6,a5);
       PMC(CH(0,k,3),CH(0,k,7),a4-a6,a7);
       }
   else
@@ -938,18 +940,20 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
       T a0, a1, a2, a3, a4, a5, a6, a7;
       PMC(a1,a5,CC(0,1,k),CC(0,5,k));
       PMC(a3,a7,CC(0,3,k),CC(0,7,k));
-      ROTX90<fwd>(a7);
       PMINPLACE(a1,a3);
       ROTX90<fwd>(a3);
+
+      ROTX90<fwd>(a7);
       PMINPLACE(a5,a7);
       ROTX45<fwd>(a5);
       ROTX135<fwd>(a7);
+
       PMC(a0,a4,CC(0,0,k),CC(0,4,k));
       PMC(a2,a6,CC(0,2,k),CC(0,6,k));
-      ROTX90<fwd>(a6);
       PMC(CH(0,k,0),CH(0,k,4),a0+a2,a1);
-      PMC(CH(0,k,1),CH(0,k,5),a4+a6,a5);
       PMC(CH(0,k,2),CH(0,k,6),a0-a2,a3);
+      ROTX90<fwd>(a6);
+      PMC(CH(0,k,1),CH(0,k,5),a4+a6,a5);
       PMC(CH(0,k,3),CH(0,k,7),a4-a6,a7);
       }
       for (size_t i=1; i<ido; ++i)
@@ -965,15 +969,15 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
         ROTX135<fwd>(a7);
         PMC(a0,a4,CC(i,0,k),CC(i,4,k));
         PMC(a2,a6,CC(i,2,k),CC(i,6,k));
-        ROTX90<fwd>(a6);
         PMINPLACE(a0,a2);
-        PMINPLACE(a4,a6);
         CH(i,k,0) = a0+a1;
         CH(i,k,4) = (a0-a1).template special_mul<fwd>(WA(3,i));
-        CH(i,k,1) = (a4+a5).template special_mul<fwd>(WA(0,i));
-        CH(i,k,5) = (a4-a5).template special_mul<fwd>(WA(4,i));
         CH(i,k,2) = (a2+a3).template special_mul<fwd>(WA(1,i));
         CH(i,k,6) = (a2-a3).template special_mul<fwd>(WA(5,i));
+        ROTX90<fwd>(a6);
+        PMINPLACE(a4,a6);
+        CH(i,k,1) = (a4+a5).template special_mul<fwd>(WA(0,i));
+        CH(i,k,5) = (a4-a5).template special_mul<fwd>(WA(4,i));
         CH(i,k,3) = (a6+a7).template special_mul<fwd>(WA(2,i));
         CH(i,k,7) = (a6-a7).template special_mul<fwd>(WA(6,i));
         }
-- 
GitLab