Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Martin Reinecke
pypocketfft
Commits
a872975f
Commit
a872975f
authored
Apr 19, 2019
by
Martin Reinecke
Browse files
vectorize!
parent
ebd22a9f
Changes
1
Hide whitespace changes
Inline
Side-by-side
pocketfft.cc
View file @
a872975f
...
...
@@ -1109,8 +1109,8 @@ template<typename T> NOINLINE void radf5(size_t ido, size_t l1, const T * restri
#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
template
<
typename
T
>
NOINLINE
void
radfg
(
size_t
ido
,
size_t
ip
,
size_t
l1
,
T
0
*
restrict
cc
,
T
0
*
restrict
ch
,
const
T
*
restrict
wa
,
const
T
*
restrict
csarr
)
T
*
restrict
cc
,
T
*
restrict
ch
,
const
T
0
*
restrict
wa
,
const
T
0
*
restrict
csarr
)
{
const
size_t
cdim
=
ip
;
size_t
ipph
=
(
ip
+
1
)
/
2
;
...
...
@@ -1289,7 +1289,7 @@ template<typename T> NOINLINE void radb3(size_t ido, size_t l1, const T * restri
T
tr2
=
2.
*
CC
(
ido
-
1
,
1
,
k
);
T
cr2
=
CC
(
0
,
0
,
k
)
+
taur
*
tr2
;
CH
(
0
,
k
,
0
)
=
CC
(
0
,
0
,
k
)
+
tr2
;
T
ci3
=
2.
*
taui
*
CC
(
0
,
2
,
k
);
T
ci3
=
T0
(
2.
)
*
taui
*
CC
(
0
,
2
,
k
);
PM
(
CH
(
0
,
k
,
2
),
CH
(
0
,
k
,
1
),
cr2
,
ci3
);
}
if
(
ido
==
1
)
return
;
...
...
@@ -1736,7 +1736,7 @@ template<typename T0> class fftblue
template
<
bool
bwd
,
typename
T
>
NOINLINE
void
fft
(
T
c
[],
T0
fct
)
{
constexpr
int
isign
=
bwd
?
1
:
-
1
;
constexpr
T0
isign
=
bwd
?
1
:
-
1
;
arr
<
T
>
akf
(
2
*
n2
);
/* initialize a_k and FFT it */
...
...
@@ -1812,9 +1812,9 @@ template<typename T0> class fftblue
{
arr
<
T
>
tmp
(
2
*
n
);
tmp
[
0
]
=
c
[
0
];
tmp
[
1
]
=
0.
;
tmp
[
1
]
=
c
[
0
]
*
0.
;
memcpy
(
tmp
.
data
()
+
2
,
c
+
1
,
(
n
-
1
)
*
sizeof
(
T
));
if
((
n
&
1
)
==
0
)
tmp
[
n
+
1
]
=
0.
;
if
((
n
&
1
)
==
0
)
tmp
[
n
+
1
]
=
c
[
0
]
*
0.
;
for
(
size_t
m
=
2
;
m
<
n
;
m
+=
2
)
{
tmp
[
2
*
n
-
m
]
=
tmp
[
m
];
...
...
@@ -1831,7 +1831,7 @@ template<typename T0> class fftblue
for
(
size_t
m
=
0
;
m
<
n
;
++
m
)
{
tmp
[
2
*
m
]
=
c
[
m
];
tmp
[
2
*
m
+
1
]
=
0.
;
tmp
[
2
*
m
+
1
]
=
c
[
m
]
*
0.
;
}
fft
<
false
>
(
tmp
.
data
(),
fct
);
c
[
0
]
=
tmp
[
0
];
...
...
@@ -1946,12 +1946,13 @@ class multi_iter
vector
<
size_t
>
pos
;
size_t
ofs_
,
len
;
int64_t
str
;
int64_t
rem
;
bool
done_
;
public:
multi_iter
(
const
multiarr
&
arr
,
size_t
idim
)
:
pos
(
arr
.
ndim
()
-
1
,
0
),
ofs_
(
0
),
len
(
arr
.
size
(
idim
)),
str
(
arr
.
stride
(
idim
)),
done_
(
false
)
str
(
arr
.
stride
(
idim
)),
rem
(
1
),
done_
(
false
)
{
dim
.
reserve
(
arr
.
ndim
()
-
1
);
for
(
size_t
i
=
0
;
i
<
arr
.
ndim
();
++
i
)
...
...
@@ -1959,10 +1960,12 @@ class multi_iter
{
dim
.
push_back
({
arr
.
size
(
i
),
arr
.
stride
(
i
)});
done_
=
done_
||
(
arr
.
size
(
i
)
==
0
);
rem
*=
arr
.
size
(
i
);
}
}
void
advance
()
{
if
(
--
rem
<=
0
)
{
done_
=
true
;
return
;
}
for
(
int
i
=
pos
.
size
()
-
1
;
i
>=
0
;
--
i
)
{
++
pos
[
i
];
...
...
@@ -1978,8 +1981,10 @@ class multi_iter
size_t
offset
()
const
{
return
ofs_
;
}
size_t
length
()
const
{
return
len
;
}
int64_t
stride
()
const
{
return
str
;
}
int64_t
remaining
()
const
{
return
rem
;
}
};
#if 0
template<typename T> void pocketfft_general_c(int ndim, const size_t *shape,
const int64_t *stride_in, const int64_t *stride_out, int nax,
const size_t *axes, bool forward, const cmplx<T> *data_in,
...
...
@@ -2036,7 +2041,100 @@ template<typename T> void pocketfft_general_c(int ndim, const size_t *shape,
fct = T(1);
}
}
#else
#if defined(__AVX__)
#include <x86intrin.h>
#define HAVE_VECSUPPORT
template
<
typename
T
>
struct
VTYPE
{};
template
<
>
struct
VTYPE
<
double
>
{
using
type
=
__m256d
;
};
template
<
>
struct
VTYPE
<
float
>
{
using
type
=
__m256
;
};
#elif defined(__SSE2__)
#include <x86intrin.h>
#define HAVE_VECSUPPORT
template
<
typename
T
>
struct
VTYPE
{};
template
<
>
struct
VTYPE
<
double
>
{
using
type
=
__m128d
;
};
template
<
>
struct
VTYPE
<
float
>
{
using
type
=
__m128
;
};
#endif
template
<
typename
T
>
void
pocketfft_general_c
(
int
ndim
,
const
size_t
*
shape
,
const
int64_t
*
stride_in
,
const
int64_t
*
stride_out
,
int
nax
,
const
size_t
*
axes
,
bool
forward
,
const
cmplx
<
T
>
*
data_in
,
cmplx
<
T
>
*
data_out
,
T
fct
)
{
// allocate temporary 1D array storage
size_t
tmpsize
=
0
;
for
(
int
iax
=
0
;
iax
<
nax
;
++
iax
)
tmpsize
=
max
(
tmpsize
,
shape
[
axes
[
iax
]]);
#ifdef HAVE_VECSUPPORT
using
vtype
=
typename
VTYPE
<
T
>::
type
;
constexpr
int
vlen
=
sizeof
(
vtype
)
/
sizeof
(
T
);
arr
<
cmplx
<
vtype
>>
xdata
(
tmpsize
);
auto
tdata
=
(
cmplx
<
T
>
*
)
xdata
.
data
();
auto
tdatav
=
xdata
.
data
();
#else
arr
<
cmplx
<
T
>>
xdata
(
tmpsize
);
auto
tdata
=
xdata
.
data
();
#endif
multiarr
a_in
(
ndim
,
shape
,
stride_in
),
a_out
(
ndim
,
shape
,
stride_out
);
unique_ptr
<
pocketfft_c
<
T
>>
plan
;
for
(
int
iax
=
0
;
iax
<
nax
;
++
iax
)
{
int
axis
=
axes
[
iax
];
multi_iter
it_in
(
a_in
,
axis
),
it_out
(
a_out
,
axis
);
if
((
!
plan
)
||
(
it_in
.
length
()
!=
plan
->
length
()))
plan
.
reset
(
new
pocketfft_c
<
T
>
(
it_in
.
length
()));
#ifdef HAVE_VECSUPPORT
while
(
it_in
.
remaining
()
>=
vlen
)
{
size_t
p_i
[
vlen
];
for
(
size_t
i
=
0
;
i
<
vlen
;
++
i
)
{
p_i
[
i
]
=
it_in
.
offset
();
it_in
.
advance
();
}
size_t
p_o
[
vlen
];
for
(
size_t
i
=
0
;
i
<
vlen
;
++
i
)
{
p_o
[
i
]
=
it_out
.
offset
();
it_out
.
advance
();
}
for
(
size_t
i
=
0
;
i
<
it_in
.
length
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
tdatav
[
i
].
r
[
j
]
=
data_in
[
p_i
[
j
]
+
i
*
it_in
.
stride
()].
r
;
tdatav
[
i
].
i
[
j
]
=
data_in
[
p_i
[
j
]
+
i
*
it_in
.
stride
()].
i
;
}
forward
?
plan
->
forward
((
vtype
*
)
tdatav
,
fct
)
:
plan
->
backward
((
vtype
*
)
tdatav
,
fct
);
for
(
size_t
i
=
0
;
i
<
it_out
.
length
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
data_out
[
p_o
[
j
]
+
i
*
it_out
.
stride
()].
r
=
tdatav
[
i
].
r
[
j
];
data_out
[
p_o
[
j
]
+
i
*
it_out
.
stride
()].
i
=
tdatav
[
i
].
i
[
j
];
}
}
#endif
while
(
it_in
.
remaining
()
>
0
)
{
for
(
size_t
i
=
0
;
i
<
it_in
.
length
();
++
i
)
tdata
[
i
]
=
data_in
[
it_in
.
offset
()
+
i
*
it_in
.
stride
()];
forward
?
plan
->
forward
((
T
*
)
tdata
,
fct
)
:
plan
->
backward
((
T
*
)
tdata
,
fct
);
for
(
size_t
i
=
0
;
i
<
it_out
.
length
();
++
i
)
data_out
[
it_out
.
offset
()
+
i
*
it_out
.
stride
()]
=
tdata
[
i
];
it_in
.
advance
();
it_out
.
advance
();
}
// after the first dimension, take data from output array
a_in
=
a_out
;
data_in
=
data_out
;
// factor has been applied, use 1 for remaining axes
fct
=
T
(
1
);
}
}
#endif
template
<
typename
T
>
void
pocketfft_general_hartley
(
int
ndim
,
const
size_t
*
shape
,
const
int64_t
*
stride_in
,
const
int64_t
*
stride_out
,
int
nax
,
const
size_t
*
axes
,
const
T
*
data_in
,
T
*
data_out
,
T
fct
)
...
...
@@ -2044,8 +2142,19 @@ template<typename T> void pocketfft_general_hartley(int ndim, const size_t *shap
// allocate temporary 1D array storage, if necessary
size_t
tmpsize
=
0
;
for
(
int
iax
=
0
;
iax
<
nax
;
++
iax
)
if
(
shape
[
axes
[
iax
]]
>
tmpsize
)
tmpsize
=
shape
[
axes
[
iax
]];
arr
<
T
>
tdata
(
tmpsize
);
tmpsize
=
max
(
tmpsize
,
shape
[
axes
[
iax
]]);
#ifdef HAVE_VECSUPPORT
using
vtype
=
typename
VTYPE
<
T
>::
type
;
constexpr
int
vlen
=
sizeof
(
vtype
)
/
sizeof
(
T
);
arr
<
vtype
>
xdata
(
tmpsize
);
auto
tdata
=
(
T
*
)
xdata
.
data
();
auto
tdatav
=
xdata
.
data
();
#else
arr
<
T
>
xdata
(
tmpsize
);
auto
tdata
=
xdata
.
data
();
#endif
multiarr
a_in
(
ndim
,
shape
,
stride_in
),
a_out
(
ndim
,
shape
,
stride_out
);
unique_ptr
<
pocketfft_r
<
T
>>
plan
;
...
...
@@ -2055,11 +2164,38 @@ template<typename T> void pocketfft_general_hartley(int ndim, const size_t *shap
multi_iter
it_in
(
a_in
,
axis
),
it_out
(
a_out
,
axis
);
if
((
!
plan
)
||
(
it_in
.
length
()
!=
plan
->
length
()))
plan
.
reset
(
new
pocketfft_r
<
T
>
(
it_in
.
length
()));
while
(
!
it_in
.
done
())
#ifdef HAVE_VECSUPPORT
while
(
it_in
.
remaining
()
>=
vlen
)
{
size_t
p_i
[
vlen
];
for
(
size_t
i
=
0
;
i
<
vlen
;
++
i
)
{
p_i
[
i
]
=
it_in
.
offset
();
it_in
.
advance
();
}
size_t
p_o
[
vlen
];
for
(
size_t
i
=
0
;
i
<
vlen
;
++
i
)
{
p_o
[
i
]
=
it_out
.
offset
();
it_out
.
advance
();
}
for
(
size_t
i
=
0
;
i
<
it_in
.
length
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
tdatav
[
i
][
j
]
=
data_in
[
p_i
[
j
]
+
i
*
it_in
.
stride
()];
plan
->
forward
((
vtype
*
)
tdatav
,
fct
);
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
data_out
[
p_o
[
j
]]
=
tdatav
[
0
][
j
];
size_t
i
=
1
,
i1
=
1
,
i2
=
it_out
.
length
()
-
1
;
for
(
i
=
1
;
i
<
it_out
.
length
()
-
1
;
i
+=
2
,
++
i1
,
--
i2
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
data_out
[
p_o
[
j
]
+
i1
*
it_out
.
stride
()]
=
tdatav
[
i
][
j
]
+
tdatav
[
i
+
1
][
j
];
data_out
[
p_o
[
j
]
+
i2
*
it_out
.
stride
()]
=
tdatav
[
i
][
j
]
-
tdatav
[
i
+
1
][
j
];
}
if
(
i
<
it_out
.
length
())
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
data_out
[
p_o
[
j
]
+
i1
*
it_out
.
stride
()]
=
tdatav
[
i
][
j
];
}
#endif
while
(
it_in
.
remaining
()
>
0
)
{
for
(
size_t
i
=
0
;
i
<
it_in
.
length
();
++
i
)
tdata
[
i
]
=
data_in
[
it_in
.
offset
()
+
i
*
it_in
.
stride
()];
plan
->
forward
((
T
*
)
tdata
.
data
()
,
fct
);
plan
->
forward
((
T
*
)
tdata
,
fct
);
// Hartley order
data_out
[
it_out
.
offset
()]
=
tdata
[
0
];
size_t
i
=
1
,
i1
=
1
,
i2
=
it_out
.
length
()
-
1
;
...
...
@@ -2086,18 +2222,67 @@ template<typename T> void pocketfft_general_r2c(int ndim, const size_t *shape,
const
T
*
data_in
,
cmplx
<
T
>
*
data_out
,
T
fct
)
{
// allocate temporary 1D array storage
arr
<
T
>
tdata
(
shape
[
axis
]);
#ifdef HAVE_VECSUPPORT
using
vtype
=
typename
VTYPE
<
T
>::
type
;
constexpr
int
vlen
=
sizeof
(
vtype
)
/
sizeof
(
T
);
arr
<
vtype
>
xdata
(
shape
[
axis
]);
auto
tdata
=
(
T
*
)
xdata
.
data
();
auto
tdatav
=
xdata
.
data
();
#else
arr
<
T
>
xdata
(
shape
[
axis
]);
auto
tdata
=
xdata
.
data
();
#endif
multiarr
a_in
(
ndim
,
shape
,
stride_in
),
a_out
(
ndim
,
shape
,
stride_out
);
pocketfft_r
<
T
>
plan
(
shape
[
axis
]);
multi_iter
it_in
(
a_in
,
axis
),
it_out
(
a_out
,
axis
);
size_t
len
=
shape
[
axis
],
s_i
=
it_in
.
stride
(),
s_o
=
it_out
.
stride
();
while
(
!
it_in
.
done
())
#ifdef HAVE_VECSUPPORT
while
(
it_in
.
remaining
()
>=
vlen
)
{
size_t
p_i
[
vlen
];
for
(
size_t
i
=
0
;
i
<
vlen
;
++
i
)
{
p_i
[
i
]
=
it_in
.
offset
();
it_in
.
advance
();
}
size_t
p_o
[
vlen
];
for
(
size_t
i
=
0
;
i
<
vlen
;
++
i
)
{
p_o
[
i
]
=
it_out
.
offset
();
it_out
.
advance
();
}
for
(
size_t
i
=
0
;
i
<
it_in
.
length
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
tdatav
[
i
][
j
]
=
data_in
[
p_i
[
j
]
+
i
*
it_in
.
stride
()];
plan
.
forward
((
vtype
*
)
tdatav
,
fct
);
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
data_out
[
p_o
[
j
]].
r
=
tdatav
[
0
][
j
];
data_out
[
p_o
[
j
]].
i
=
0.
;
}
size_t
i
;
for
(
i
=
1
;
i
<
len
-
1
;
i
+=
2
)
{
size_t
io
=
(
i
+
1
)
/
2
;
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
data_out
[
p_o
[
j
]
+
io
*
it_out
.
stride
()].
r
=
tdatav
[
i
][
j
];
data_out
[
p_o
[
j
]
+
io
*
it_out
.
stride
()].
i
=
tdatav
[
i
+
1
][
j
];
}
}
if
(
i
<
len
)
{
size_t
io
=
(
i
+
1
)
/
2
;
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
data_out
[
p_o
[
j
]
+
io
*
it_out
.
stride
()].
r
=
tdatav
[
i
][
j
];
data_out
[
p_o
[
j
]
+
io
*
it_out
.
stride
()].
i
=
0.
;
}
}
}
#endif
while
(
it_in
.
remaining
()
>
0
)
{
const
T
*
d_i
=
data_in
+
it_in
.
offset
();
cmplx
<
T
>
*
d_o
=
data_out
+
it_out
.
offset
();
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
tdata
[
i
]
=
d_i
[
i
*
s_i
];
plan
.
forward
(
tdata
.
data
()
,
fct
);
plan
.
forward
(
tdata
,
fct
);
d_o
[
0
].
r
=
tdata
[
0
];
d_o
[
0
].
i
=
0.
;
size_t
i
;
...
...
@@ -2122,11 +2307,54 @@ template<typename T> void pocketfft_general_c2r(int ndim, const size_t *shape_ou
const
cmplx
<
T
>
*
data_in
,
T
*
data_out
,
T
fct
)
{
// allocate temporary 1D array storage
arr
<
T
>
tdata
(
shape_out
[
axis
]);
#ifdef HAVE_VECSUPPORT
using
vtype
=
typename
VTYPE
<
T
>::
type
;
constexpr
int
vlen
=
sizeof
(
vtype
)
/
sizeof
(
T
);
arr
<
vtype
>
xdata
(
shape_out
[
axis
]);
auto
tdata
=
(
T
*
)
xdata
.
data
();
auto
tdatav
=
xdata
.
data
();
#else
arr
<
T
>
xdata
(
shape_out
[
axis
]);
auto
tdata
=
xdata
.
data
();
#endif
multiarr
a_in
(
ndim
,
shape_out
,
stride_in
),
a_out
(
ndim
,
shape_out
,
stride_out
);
pocketfft_r
<
T
>
plan
(
shape_out
[
axis
]);
multi_iter
it_in
(
a_in
,
axis
),
it_out
(
a_out
,
axis
);
size_t
len
=
shape_out
[
axis
],
s_i
=
it_in
.
stride
(),
s_o
=
it_out
.
stride
();
#ifdef HAVE_VECSUPPORT
while
(
it_in
.
remaining
()
>=
vlen
)
{
size_t
p_i
[
vlen
];
for
(
size_t
i
=
0
;
i
<
vlen
;
++
i
)
{
p_i
[
i
]
=
it_in
.
offset
();
it_in
.
advance
();
}
size_t
p_o
[
vlen
];
for
(
size_t
i
=
0
;
i
<
vlen
;
++
i
)
{
p_o
[
i
]
=
it_out
.
offset
();
it_out
.
advance
();
}
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
tdatav
[
0
][
j
]
=
data_in
[
p_i
[
j
]].
r
;
size_t
i
;
for
(
i
=
1
;
i
<
len
-
1
;
i
+=
2
)
{
size_t
ii
=
(
i
+
1
)
/
2
;
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
tdatav
[
i
][
j
]
=
data_in
[
p_i
[
j
]
+
ii
*
s_i
].
r
;
tdatav
[
i
+
1
][
j
]
=
data_in
[
p_i
[
j
]
+
ii
*
s_i
].
i
;
}
}
if
(
i
<
len
)
{
size_t
ii
=
(
i
+
1
)
/
2
;
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
tdatav
[
i
][
j
]
=
data_in
[
p_i
[
j
]
+
ii
*
s_i
].
r
;
}
plan
.
backward
(
tdatav
,
fct
);
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
data_out
[
p_o
[
j
]
+
i
*
s_o
]
=
tdatav
[
i
][
j
];
}
#endif
while
(
!
it_in
.
done
())
{
const
cmplx
<
T
>
*
d_i
=
data_in
+
it_in
.
offset
();
...
...
@@ -2144,7 +2372,7 @@ template<typename T> void pocketfft_general_c2r(int ndim, const size_t *shape_ou
size_t
ii
=
(
i
+
1
)
/
2
;
tdata
[
i
]
=
d_i
[
ii
*
s_i
].
r
;
}
plan
.
backward
(
tdata
.
data
()
,
fct
);
plan
.
backward
(
tdata
,
fct
);
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
d_o
[
i
*
s_o
]
=
tdata
[
i
];
it_in
.
advance
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment