Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pypocketfft
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
4
Issues
4
List
Boards
Labels
Service Desk
Milestones
Merge Requests
2
Merge Requests
2
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Martin Reinecke
pypocketfft
Commits
985a0bd9
Commit
985a0bd9
authored
Aug 09, 2019
by
Martin Reinecke
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'sincospi' into 'master'
Custom sincospi See merge request
!22
parents
553e4f1c
ab72b08d
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
221 additions
and
45 deletions
+221
-45
pocketfft_hdronly.h
pocketfft_hdronly.h
+175
-45
stress.py
stress.py
+46
-0
No files found.
pocketfft_hdronly.h
View file @
985a0bd9
...
...
@@ -266,50 +266,179 @@ template<bool fwd, typename T> void ROTX90(cmplx<T> &a)
// twiddle factor section
//
/** Approximate sin(pi*a) within the range [-0.25, 0.25] */
inline
float
sinpi0
(
float
a
)
{
// adapted from https://stackoverflow.com/questions/42792939/
float
s
=
a
*
a
;
float
r
=
-
5.957031250000000000e-01
f
;
r
=
fma
(
r
,
s
,
2.550399541854858398e+00
f
);
r
=
fma
(
r
,
s
,
-
5.167724132537841797e+00
f
);
r
=
(
a
*
s
)
*
r
;
return
fma
(
a
,
3.141592741012573242e+00
f
,
r
);
}
/** Approximate sin(pi*a) within the range [-0.25, 0.25] */
inline
double
sinpi0
(
double
a
)
{
// adapted from https://stackoverflow.com/questions/42792939/
double
s
=
a
*
a
;
double
r
=
4.6151442520157035e-4
;
r
=
fma
(
r
,
s
,
-
7.3700183130883555e-3
);
r
=
fma
(
r
,
s
,
8.2145868949323936e-2
);
r
=
fma
(
r
,
s
,
-
5.9926452893214921e-1
);
r
=
fma
(
r
,
s
,
2.5501640398732688e+0
);
r
=
fma
(
r
,
s
,
-
5.1677127800499516e+0
);
s
=
s
*
a
;
r
=
r
*
s
;
return
fma
(
a
,
3.1415926535897931e+0
,
r
);
}
/** Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
inline
float
cosm1pi0
(
float
a
)
{
// adapted from https://stackoverflow.com/questions/42792939/
float
s
=
a
*
a
;
float
r
=
2.313842773437500000e-01
f
;
r
=
fmaf
(
r
,
s
,
-
1.335021972656250000e+00
f
);
r
=
fmaf
(
r
,
s
,
4.058703899383544922e+00
f
);
r
=
fmaf
(
r
,
s
,
-
4.934802055358886719e+00
f
);
return
r
*
s
;
}
/** Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
inline
double
cosm1pi0
(
double
a
)
{
// adapted from https://stackoverflow.com/questions/42792939/
double
s
=
a
*
a
;
double
r
=
-
1.0369917389758117e-4
;
r
=
fma
(
r
,
s
,
1.9294935641298806e-3
);
r
=
fma
(
r
,
s
,
-
2.5806887942825395e-2
);
r
=
fma
(
r
,
s
,
2.3533063028328211e-1
);
r
=
fma
(
r
,
s
,
-
1.3352627688538006e+0
);
r
=
fma
(
r
,
s
,
4.0587121264167623e+0
);
r
=
fma
(
r
,
s
,
-
4.9348022005446790e+0
);
return
r
*
s
;
}
template
<
typename
T
>
void
sincosm1pi0
(
T
a_
,
T
*
POCKETFFT_RESTRICT
res
)
{
if
(
sizeof
(
T
)
>
sizeof
(
double
))
// don't have the code for long double
{
constexpr
T
pi
=
T
(
3.141592653589793238462643383279502884197
L
);
auto
s
=
sin
(
pi
*
a_
);
res
[
1
]
=
s
;
res
[
0
]
=
(
s
*
s
)
/
(
-
sqrt
((
1
-
s
)
*
(
1
+
s
))
-
1
);
return
;
}
res
[
0
]
=
T
(
cosm1pi0
(
double
(
a_
)));
res
[
1
]
=
T
(
sinpi0
(
double
(
a_
)));
}
template
<
typename
T
>
T
sinpi
(
T
a
)
{
// reduce argument to primary approximation interval (-0.25, 0.25)
auto
r
=
nearbyint
(
a
+
a
);
// must use IEEE-754 "to nearest" rounding
auto
i
=
(
int64_t
)
r
;
auto
t
=
fma
(
T
(
-
0.5
),
r
,
a
);
switch
(
i
%
4
)
{
case
0
:
return
sinpi0
(
t
);
case
1
:
case
-
3
:
return
cosm1pi0
(
t
)
+
T
(
1.
);
case
2
:
case
-
2
:
return
T
(
0.
)
-
sinpi0
(
t
);
case
3
:
case
-
1
:
return
T
(
-
1.
)
-
cosm1pi0
(
t
);
}
throw
runtime_error
(
"cannot happen"
);
}
template
<
typename
T
>
T
cospi
(
T
a
)
{
// reduce argument to primary approximation interval (-0.25, 0.25)
auto
r
=
nearbyint
(
a
+
a
);
// must use IEEE-754 "to nearest" rounding
auto
i
=
(
int64_t
)
r
;
auto
t
=
fma
(
T
(
-
0.5
),
r
,
a
);
switch
(
i
%
4
)
{
case
0
:
return
cosm1pi0
(
t
)
+
T
(
1.
);
case
1
:
case
-
3
:
return
T
(
0.
)
-
sinpi0
(
t
);
case
2
:
case
-
2
:
return
T
(
-
1.
)
-
cosm1pi0
(
t
);
case
3
:
case
-
1
:
return
sinpi0
(
t
);
}
throw
runtime_error
(
"cannot happen"
);
}
inline
long
double
cospi
(
long
double
a
)
{
constexpr
auto
pi
=
3.141592653589793238462643383279502884197
L
;
return
sizeof
(
long
double
)
>
sizeof
(
double
)
?
cos
(
a
*
pi
)
:
static_cast
<
long
double
>
(
cospi
(
static_cast
<
double
>
(
a
)));
}
inline
long
double
sinpi
(
long
double
a
)
{
constexpr
auto
pi
=
3.141592653589793238462643383279502884197
L
;
return
sizeof
(
long
double
)
>
sizeof
(
double
)
?
sin
(
a
*
pi
)
:
static_cast
<
long
double
>
(
sinpi
(
static_cast
<
double
>
(
a
)));
}
template
<
typename
T
>
void
sincospi
(
T
a
,
T
*
POCKETFFT_RESTRICT
res
)
{
// reduce argument to primary approximation interval (-0.25, 0.25)
auto
r
=
nearbyint
(
a
+
a
);
// must use IEEE-754 "to nearest" rounding
auto
i
=
(
int64_t
)
r
;
auto
t
=
fma
(
T
(
-
0.5
),
r
,
a
);
auto
c
=
cosm1pi0
(
t
)
+
T
(
1.
);
auto
s
=
sinpi0
(
t
);
// map results according to quadrant
if
(
i
&
2
)
{
s
=
T
(
0.
)
-
s
;
c
=
T
(
0.
)
-
c
;
}
if
(
i
&
1
)
{
swap
(
s
,
c
);
c
=
T
(
0.
)
-
c
;
}
res
[
0
]
=
c
;
res
[
1
]
=
s
;
}
inline
void
sincospi
(
long
double
a
,
long
double
*
POCKETFFT_RESTRICT
res
)
{
if
(
sizeof
(
long
double
)
>
sizeof
(
double
))
{
constexpr
auto
pi
=
3.141592653589793238462643383279502884197
L
;
res
[
0
]
=
cos
(
pi
*
a
);
res
[
1
]
=
sin
(
pi
*
a
);
}
else
{
double
sincos
[
2
];
sincospi
(
static_cast
<
double
>
(
a
),
sincos
);
res
[
0
]
=
static_cast
<
long
double
>
(
sincos
[
0
]);
res
[
1
]
=
static_cast
<
long
double
>
(
sincos
[
1
]);
}
}
template
<
typename
T
>
class
sincos_2pibyn
{
private:
using
Thigh
=
typename
conditional
<
(
sizeof
(
T
)
>
sizeof
(
double
)),
T
,
double
>::
type
;
arr
<
T
>
data
;
void
my_sincosm1pi
(
Thigh
a_
,
Thigh
*
POCKETFFT_RESTRICT
res
)
{
if
(
sizeof
(
Thigh
)
>
sizeof
(
double
))
// don't have the code for long double
{
constexpr
Thigh
pi
=
Thigh
(
3.141592653589793238462643383279502884197
L
);
auto
s
=
sin
(
pi
*
a_
);
res
[
1
]
=
s
;
res
[
0
]
=
(
s
*
s
)
/
(
-
sqrt
((
1
-
s
)
*
(
1
+
s
))
-
1
);
return
;
}
// adapted from https://stackoverflow.com/questions/42792939/
// CAUTION: this function only works for arguments in the range
// [-0.25; 0.25]!
double
a
=
double
(
a_
);
double
s
=
a
*
a
;
/* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
double
r
=
-
1.0369917389758117e-4
;
r
=
fma
(
r
,
s
,
1.9294935641298806e-3
);
r
=
fma
(
r
,
s
,
-
2.5806887942825395e-2
);
r
=
fma
(
r
,
s
,
2.3533063028328211e-1
);
r
=
fma
(
r
,
s
,
-
1.3352627688538006e+0
);
r
=
fma
(
r
,
s
,
4.0587121264167623e+0
);
r
=
fma
(
r
,
s
,
-
4.9348022005446790e+0
);
double
c
=
r
*
s
;
/* Approximate sin(pi*x) for x in [-0.25,0.25] */
r
=
4.6151442520157035e-4
;
r
=
fma
(
r
,
s
,
-
7.3700183130883555e-3
);
r
=
fma
(
r
,
s
,
8.2145868949323936e-2
);
r
=
fma
(
r
,
s
,
-
5.9926452893214921e-1
);
r
=
fma
(
r
,
s
,
2.5501640398732688e+0
);
r
=
fma
(
r
,
s
,
-
5.1677127800499516e+0
);
s
=
s
*
a
;
r
=
r
*
s
;
s
=
fma
(
a
,
3.1415926535897931e+0
,
r
);
res
[
0
]
=
c
;
res
[
1
]
=
s
;
}
POCKETFFT_NOINLINE
void
calc_first_octant
(
size_t
den
,
T
*
POCKETFFT_RESTRICT
res
)
{
...
...
@@ -321,7 +450,7 @@ template<typename T> class sincos_2pibyn
arr
<
Thigh
>
tmp
(
2
*
l1
);
for
(
size_t
i
=
1
;
i
<
l1
;
++
i
)
{
my_sincosm1pi
(
Thigh
(
2
*
i
)
/
Thigh
(
den
),
&
tmp
[
2
*
i
]);
sincosm1pi0
(
Thigh
(
2
*
i
)
/
Thigh
(
den
),
&
tmp
[
2
*
i
]);
res
[
2
*
i
]
=
T
(
tmp
[
2
*
i
]
+
1
);
res
[
2
*
i
+
1
]
=
T
(
tmp
[
2
*
i
+
1
]);
}
...
...
@@ -329,7 +458,7 @@ template<typename T> class sincos_2pibyn
while
(
start
<
n
)
{
Thigh
cs
[
2
];
my_sincosm1pi
((
Thigh
(
2
*
start
))
/
Thigh
(
den
),
cs
);
sincosm1pi0
((
Thigh
(
2
*
start
))
/
Thigh
(
den
),
cs
);
res
[
2
*
start
]
=
T
(
cs
[
0
]
+
1
);
res
[
2
*
start
+
1
]
=
T
(
cs
[
1
]);
size_t
end
=
l1
;
...
...
@@ -2561,9 +2690,9 @@ template<typename T0> class T_dcst23
POCKETFFT_NOINLINE
T_dcst23
(
size_t
length
)
:
fftplan
(
length
),
twiddle
(
length
)
{
const
expr
T0
pi
=
T0
(
3.141592653589793238462643383279502884197
L
);
const
auto
oo2n
=
T0
(
0.5
)
/
T0
(
length
);
for
(
size_t
i
=
0
;
i
<
length
;
++
i
)
twiddle
[
i
]
=
T0
(
cos
(
0.5
*
pi
*
T0
(
i
+
1
)
/
T0
(
length
)
));
twiddle
[
i
]
=
cospi
(
oo2n
*
T0
(
i
+
1
));
}
template
<
typename
T
>
POCKETFFT_NOINLINE
void
exec
(
T
c
[],
T0
fct
,
bool
ortho
,
...
...
@@ -2638,12 +2767,13 @@ template<typename T0> class T_dcst4
rfft
((
N
&
1
)
?
new
pocketfft_r
<
T0
>
(
N
)
:
nullptr
),
C2
((
N
&
1
)
?
0
:
N
/
2
)
{
const
expr
T0
pi
=
T0
(
3.141592653589793238462643383279502884197
L
);
const
auto
oon
=
-
T0
(
1.
)
/
T0
(
N
);
if
((
N
&
1
)
==
0
)
for
(
size_t
i
=
0
;
i
<
N
/
2
;
++
i
)
{
T0
ang
=
-
pi
/
T0
(
N
)
*
(
T0
(
i
)
+
T0
(
0.125
));
C2
[
i
].
Set
(
cos
(
ang
),
sin
(
ang
));
T0
sincos
[
2
];
sincospi
(
oon
*
(
T0
(
i
)
+
T0
(
0.125
)),
sincos
);
C2
[
i
].
Set
(
sincos
[
0
],
sincos
[
1
]);
}
}
...
...
stress.py
View file @
985a0bd9
...
...
@@ -50,6 +50,7 @@ def test(err):
axes
=
axes
[:
nax
]
lastsize
=
shape
[
axes
[
-
1
]]
a
=
np
.
random
.
rand
(
*
shape
)
-
0.5
+
1j
*
np
.
random
.
rand
(
*
shape
)
-
0.5j
a_32
=
a
.
astype
(
np
.
complex64
)
b
=
ifftn
(
fftn
(
a
,
axes
=
axes
,
nthreads
=
nthreads
),
axes
=
axes
,
inorm
=
2
,
nthreads
=
nthreads
)
err
=
update_err
(
err
,
"cmax"
,
_l2error
(
a
,
b
))
...
...
@@ -86,6 +87,51 @@ def test(err):
nthreads
=
nthreads
),
axes
=
axes
,
inorm
=
2
,
nthreads
=
nthreads
)
err
=
update_err
(
err
,
"hmaxf"
,
_l2error
(
a
.
real
.
astype
(
np
.
float32
),
b
))
if
all
(
a
.
shape
[
i
]
>
1
for
i
in
axes
):
b
=
pypocketfft
.
dct
(
pypocketfft
.
dct
(
a
.
real
,
axes
=
axes
,
nthreads
=
nthreads
,
type
=
1
),
axes
=
axes
,
type
=
1
,
nthreads
=
nthreads
,
inorm
=
2
)
err
=
update_err
(
err
,
"c1max"
,
_l2error
(
a
.
real
,
b
))
b
=
pypocketfft
.
dct
(
pypocketfft
.
dct
(
a_32
.
real
,
axes
=
axes
,
nthreads
=
nthreads
,
type
=
1
),
axes
=
axes
,
type
=
1
,
nthreads
=
nthreads
,
inorm
=
2
)
err
=
update_err
(
err
,
"c1maxf"
,
_l2error
(
a_32
.
real
,
b
))
b
=
pypocketfft
.
dct
(
pypocketfft
.
dct
(
a
.
real
,
axes
=
axes
,
nthreads
=
nthreads
,
type
=
2
),
axes
=
axes
,
type
=
3
,
nthreads
=
nthreads
,
inorm
=
2
)
err
=
update_err
(
err
,
"c23max"
,
_l2error
(
a
.
real
,
b
))
b
=
pypocketfft
.
dct
(
pypocketfft
.
dct
(
a_32
.
real
,
axes
=
axes
,
nthreads
=
nthreads
,
type
=
2
),
axes
=
axes
,
type
=
3
,
nthreads
=
nthreads
,
inorm
=
2
)
err
=
update_err
(
err
,
"c23maxf"
,
_l2error
(
a_32
.
real
,
b
))
b
=
pypocketfft
.
dct
(
pypocketfft
.
dct
(
a
.
real
,
axes
=
axes
,
nthreads
=
nthreads
,
type
=
4
),
axes
=
axes
,
type
=
4
,
nthreads
=
nthreads
,
inorm
=
2
)
err
=
update_err
(
err
,
"c4max"
,
_l2error
(
a
.
real
,
b
))
b
=
pypocketfft
.
dct
(
pypocketfft
.
dct
(
a_32
.
real
,
axes
=
axes
,
nthreads
=
nthreads
,
type
=
4
),
axes
=
axes
,
type
=
4
,
nthreads
=
nthreads
,
inorm
=
2
)
err
=
update_err
(
err
,
"c4maxf"
,
_l2error
(
a_32
.
real
,
b
))
b
=
pypocketfft
.
dst
(
pypocketfft
.
dst
(
a_32
.
real
,
axes
=
axes
,
nthreads
=
nthreads
,
type
=
1
),
axes
=
axes
,
type
=
1
,
nthreads
=
nthreads
,
inorm
=
2
)
err
=
update_err
(
err
,
"s1maxf"
,
_l2error
(
a_32
.
real
,
b
))
b
=
pypocketfft
.
dst
(
pypocketfft
.
dst
(
a
.
real
,
axes
=
axes
,
nthreads
=
nthreads
,
type
=
2
),
axes
=
axes
,
type
=
3
,
nthreads
=
nthreads
,
inorm
=
2
)
err
=
update_err
(
err
,
"s23max"
,
_l2error
(
a
.
real
,
b
))
b
=
pypocketfft
.
dst
(
pypocketfft
.
dst
(
a_32
.
real
,
axes
=
axes
,
nthreads
=
nthreads
,
type
=
2
),
axes
=
axes
,
type
=
3
,
nthreads
=
nthreads
,
inorm
=
2
)
err
=
update_err
(
err
,
"s23maxf"
,
_l2error
(
a_32
.
real
,
b
))
b
=
pypocketfft
.
dst
(
pypocketfft
.
dst
(
a
.
real
,
axes
=
axes
,
nthreads
=
nthreads
,
type
=
4
),
axes
=
axes
,
type
=
4
,
nthreads
=
nthreads
,
inorm
=
2
)
err
=
update_err
(
err
,
"s4max"
,
_l2error
(
a
.
real
,
b
))
b
=
pypocketfft
.
dst
(
pypocketfft
.
dst
(
a_32
.
real
,
axes
=
axes
,
nthreads
=
nthreads
,
type
=
4
),
axes
=
axes
,
type
=
4
,
nthreads
=
nthreads
,
inorm
=
2
)
err
=
update_err
(
err
,
"s4maxf"
,
_l2error
(
a_32
.
real
,
b
))
err
=
dict
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment