diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 9eeef6324f598352c3beb80e418a2e1cbcaaa8cf..403d8c971422686e0e4b8932522000a7ed529f5b 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -915,74 +915,71 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1, for (size_t k=0; k<l1; ++k) { T a0, a1, a2, a3, a4, a5, a6, a7; - PMC(a0,a4,CC(0,0,k),CC(0,4,k)); PMC(a1,a5,CC(0,1,k),CC(0,5,k)); - PMC(a2,a6,CC(0,2,k),CC(0,6,k)); PMC(a3,a7,CC(0,3,k),CC(0,7,k)); - ROTX90<fwd>(a6); - ROTX90<fwd>(a7); - PMINPLACE(a0,a2); PMINPLACE(a1,a3); - PMINPLACE(a4,a6); + ROTX90<fwd>(a3); + + ROTX90<fwd>(a7); PMINPLACE(a5,a7); ROTX45<fwd>(a5); - ROTX90<fwd>(a3); ROTX135<fwd>(a7); - PMC(CH(0,k,0),CH(0,k,4),a0,a1); - PMC(CH(0,k,1),CH(0,k,5),a4,a5); - PMC(CH(0,k,2),CH(0,k,6),a2,a3); - PMC(CH(0,k,3),CH(0,k,7),a6,a7); + + PMC(a0,a4,CC(0,0,k),CC(0,4,k)); + PMC(a2,a6,CC(0,2,k),CC(0,6,k)); + PMC(CH(0,k,0),CH(0,k,4),a0+a2,a1); + PMC(CH(0,k,2),CH(0,k,6),a0-a2,a3); + ROTX90<fwd>(a6); + PMC(CH(0,k,1),CH(0,k,5),a4+a6,a5); + PMC(CH(0,k,3),CH(0,k,7),a4-a6,a7); } else for (size_t k=0; k<l1; ++k) + { { T a0, a1, a2, a3, a4, a5, a6, a7; - PMC(a0,a4,CC(0,0,k),CC(0,4,k)); PMC(a1,a5,CC(0,1,k),CC(0,5,k)); - PMC(a2,a6,CC(0,2,k),CC(0,6,k)); PMC(a3,a7,CC(0,3,k),CC(0,7,k)); - ROTX90<fwd>(a6); - ROTX90<fwd>(a7); - PMINPLACE(a0,a2); PMINPLACE(a1,a3); - PMINPLACE(a4,a6); + ROTX90<fwd>(a3); + + ROTX90<fwd>(a7); PMINPLACE(a5,a7); ROTX45<fwd>(a5); - ROTX90<fwd>(a3); ROTX135<fwd>(a7); - PMC(CH(0,k,0),CH(0,k,4),a0,a1); - PMC(CH(0,k,1),CH(0,k,5),a4,a5); - PMC(CH(0,k,2),CH(0,k,6),a2,a3); - PMC(CH(0,k,3),CH(0,k,7),a6,a7); + PMC(a0,a4,CC(0,0,k),CC(0,4,k)); + PMC(a2,a6,CC(0,2,k),CC(0,6,k)); + PMC(CH(0,k,0),CH(0,k,4),a0+a2,a1); + PMC(CH(0,k,2),CH(0,k,6),a0-a2,a3); + ROTX90<fwd>(a6); + PMC(CH(0,k,1),CH(0,k,5),a4+a6,a5); + PMC(CH(0,k,3),CH(0,k,7),a4-a6,a7); + } for (size_t i=1; i<ido; ++i) { T a0, a1, a2, a3, a4, a5, a6, a7; - PMC(a0,a4,CC(i,0,k),CC(i,4,k)); PMC(a1,a5,CC(i,1,k),CC(i,5,k)); - PMC(a2,a6,CC(i,2,k),CC(i,6,k)); PMC(a3,a7,CC(i,3,k),CC(i,7,k)); - ROTX90<fwd>(a6); ROTX90<fwd>(a7); - PMINPLACE(a0,a2); PMINPLACE(a1,a3); - PMINPLACE(a4,a6); + ROTX90<fwd>(a3); PMINPLACE(a5,a7); ROTX45<fwd>(a5); - ROTX90<fwd>(a3); ROTX135<fwd>(a7); - PMINPLACE(a0,a1); - PMINPLACE(a2,a3); - PMINPLACE(a4,a5); - PMINPLACE(a6,a7); - CH(i,k,0) = a0; - CH(i,k,1) = a4.template special_mul<fwd>(WA(0,i)); - CH(i,k,2) = a2.template special_mul<fwd>(WA(1,i)); - CH(i,k,3) = a6.template special_mul<fwd>(WA(2,i)); - CH(i,k,4) = a1.template special_mul<fwd>(WA(3,i)); - CH(i,k,5) = a5.template special_mul<fwd>(WA(4,i)); - CH(i,k,6) = a3.template special_mul<fwd>(WA(5,i)); - CH(i,k,7) = a7.template special_mul<fwd>(WA(6,i)); + PMC(a0,a4,CC(i,0,k),CC(i,4,k)); + PMC(a2,a6,CC(i,2,k),CC(i,6,k)); + PMINPLACE(a0,a2); + CH(i,k,0) = a0+a1; + CH(i,k,4) = (a0-a1).template special_mul<fwd>(WA(3,i)); + CH(i,k,2) = (a2+a3).template special_mul<fwd>(WA(1,i)); + CH(i,k,6) = (a2-a3).template special_mul<fwd>(WA(5,i)); + ROTX90<fwd>(a6); + PMINPLACE(a4,a6); + CH(i,k,1) = (a4+a5).template special_mul<fwd>(WA(0,i)); + CH(i,k,5) = (a4-a5).template special_mul<fwd>(WA(4,i)); + CH(i,k,3) = (a6+a7).template special_mul<fwd>(WA(2,i)); + CH(i,k,7) = (a6-a7).template special_mul<fwd>(WA(6,i)); } } }