Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
On Thursday, 7th July from 1 to 3 pm there will be a maintenance with a short downtime of GitLab.
Open sidebar
Simon Perkins
ducc
Commits
1c168d5b
Commit
1c168d5b
authored
May 13, 2020
by
Martin Reinecke
Browse files
current state
parent
da0ac67a
Changes
4
Hide whitespace changes
Inline
Side-by-side
pypocketfft/demos/bench.py
View file @
1c168d5b
...
...
@@ -118,25 +118,26 @@ def bench_nd(ndim, nmax, nthr, ntry, tp, funcs, nrepeat, ttl="", filename="",
tmp
=
func
(
a
,
nrepeat
,
nthr
)
res
.
append
(
tmp
[
0
])
output
.
append
(
tmp
[
1
])
print
(
"{0:5.2e}/{1:5.2e} = {2:5.2f} L2 error={3}"
.
format
(
results
[
0
][
n
],
results
[
1
][
n
],
results
[
0
][
n
]
/
results
[
1
][
n
],
_l2error
(
output
[
0
],
output
[
1
])))
results
=
np
.
array
(
results
)
plt
.
title
(
"{}: {}D, {}, max_extent={}"
.
format
(
ttl
,
ndim
,
str
(
tp
),
nmax
))
plt
.
xlabel
(
"time ratio"
)
plt
.
ylabel
(
"counts"
)
plt
.
hist
(
results
[
0
,
:]
/
results
[
1
,
:],
bins
=
"auto"
)
if
filename
!=
""
:
plt
.
savefig
(
filename
)
plt
.
show
()
funcs
=
(
measure_pypocketfft
,
measure_fftw
)
#
print("{0:5.2e}/{1:5.2e} = {2:5.2f} L2 error={3}".format(results[0][n],results[1][n],results[0][n]/results[1][n],_l2error(output[0],output[1])))
#
results = np.array(results)
#
plt.title("{}: {}D, {}, max_extent={}".format(
#
ttl, ndim, str(tp), nmax))
#
plt.xlabel("time ratio")
#
plt.ylabel("counts")
#
plt.hist(results[0, :]/results[1, :], bins="auto")
#
if filename != "":
#
plt.savefig(filename)
#
plt.show()
funcs
=
(
measure_pypocketfft
,)
ttl
=
"pypocketfft/FFTW()"
ntry
=
100
nthr
=
1
nice_sizes
=
True
bench_nd
(
1
,
8192
,
nthr
,
100
,
"c16"
,
funcs
,
10
,
ttl
,
"1d.png"
,
nice_sizes
)
bench_nd
(
2
,
2048
,
nthr
,
100
,
"c16"
,
funcs
,
2
,
ttl
,
"2d.png"
,
nice_sizes
)
bench_nd
(
3
,
256
,
nthr
,
100
,
"c16"
,
funcs
,
2
,
ttl
,
"3d.png"
,
nice_sizes
)
bench_nd
(
1
,
8192
,
nthr
,
100
,
"c8"
,
funcs
,
10
,
ttl
,
"1d_single.png"
,
nice_sizes
)
bench_nd
(
2
,
2048
,
nthr
,
100
,
"c8"
,
funcs
,
2
,
ttl
,
"2d_single.png"
,
nice_sizes
)
bench_nd
(
3
,
256
,
nthr
,
100
,
"c8"
,
funcs
,
2
,
ttl
,
"3d_single.png"
,
nice_sizes
)
#
bench_nd(1, 8192, nthr,
ntry
, "c16", funcs, 10, ttl, "1d.png", nice_sizes)
bench_nd
(
2
,
2048
,
nthr
,
ntry
,
"c16"
,
funcs
,
2
,
ttl
,
"2d.png"
,
nice_sizes
)
#
bench_nd(3, 256, nthr,
ntry
, "c16", funcs, 2, ttl, "3d.png", nice_sizes)
#
bench_nd(1, 8192, nthr,
ntry
, "c8", funcs, 10, ttl, "1d_single.png", nice_sizes)
#
bench_nd(2, 2048, nthr,
ntry
, "c8", funcs, 2, ttl, "2d_single.png", nice_sizes)
#
bench_nd(3, 256, nthr,
ntry
, "c8", funcs, 2, ttl, "3d_single.png", nice_sizes)
src/mr_util/infra/communication.h
View file @
1c168d5b
...
...
@@ -24,6 +24,8 @@
#ifndef MRUTIL_COMMUNICATION_H
#define MRUTIL_COMMUNICATION_H
#define MRUTIL_USE_MPI
#include <vector>
#ifdef MRUTIL_USE_MPI
#include <mpi.h>
...
...
@@ -95,8 +97,7 @@ class Communicator
template
<
typename
T
>
void
sendrecvRaw
(
const
T
*
sendbuf
,
size_t
sendcnt
,
size_t
dest
,
T
*
recvbuf
,
size_t
recvcnt
,
size_t
src
)
const
{
sendrecvRawVoid
(
sendbuf
,
sendcnt
,
dest
,
recvbuf
,
recvcnt
,
src
,
tidx
<
T
>
());
sendrecvRawVoid
(
sendbuf
,
sendcnt
,
dest
,
recvbuf
,
recvcnt
,
src
,
tidx
<
T
>
());
}
template
<
typename
T
>
void
sendrecv_replaceRaw
(
T
*
data
,
size_t
num
,
size_t
dest
,
size_t
src
)
const
...
...
@@ -110,6 +111,12 @@ class Communicator
template
<
typename
T
>
void
allgathervRaw
(
const
T
*
in
,
int
numin
,
T
*
out
,
const
int
*
numout
,
const
int
*
disout
)
const
{
allgathervRawVoid
(
in
,
numin
,
out
,
numout
,
disout
,
tidx
<
T
>
());
}
template
<
typename
T
>
vector
<
T
>
allgatherVec
(
const
T
&
in
)
const
{
vector
<
T
>
res
(
num_ranks_
);
allgatherRaw
(
&
in
,
res
.
data
(),
1
);
return
res
;
}
template
<
typename
T
>
T
allreduce
(
const
T
&
in
,
redOp
op
)
const
{
...
...
@@ -117,13 +124,18 @@ class Communicator
allreduceRaw
(
&
in
,
&
out
,
1
,
op
);
return
out
;
}
template
<
typename
T
>
std
::
vector
<
T
>
allreduce
template
<
typename
T
>
std
::
vector
<
T
>
allreduce
Vec
(
const
std
::
vector
<
T
>
&
in
,
redOp
op
)
const
{
std
::
vector
<
T
>
out
(
in
.
size
());
allreduceRaw
(
in
.
data
(),
out
.
data
(),
in
.
size
(),
op
);
return
out
;
}
template
<
typename
T
>
void
sendrecvVec
(
const
vector
<
T
>
&
sendbuf
,
size_t
dest
,
vector
<
T
>
&
recvbuf
,
size_t
src
)
const
{
sendrecvRaw
(
sendbuf
.
data
(),
sendbuf
.
size
(),
dest
,
recvbuf
.
data
(),
recvbuf
.
size
(),
src
);
}
/*! NB: \a num refers to the <i>total</i> number of items in the arrays;
the individual message size is \a num/num_ranks(). */
template
<
typename
T
>
void
all2allRaw
(
const
T
*
in
,
T
*
out
,
size_t
num
)
const
...
...
src/mr_util/infra/simd.h
View file @
1c168d5b
...
...
@@ -52,6 +52,10 @@ namespace mr {
namespace
detail_simd
{
template
<
typename
T
>
T
myexp
(
T
);
// {return -42;}
template
<
>
inline
double
myexp
(
double
v
)
{
return
std
::
exp
(
v
);}
template
<
>
inline
float
myexp
(
float
v
)
{
return
std
::
exp
(
v
);}
template
<
typename
T
>
constexpr
inline
bool
vectorizable
=
false
;
template
<
>
constexpr
inline
bool
vectorizable
<
float
>
=
true
;
template
<
>
constexpr
inline
bool
vectorizable
<
double
>
=
true
;
...
...
@@ -114,6 +118,13 @@ template<typename T, size_t len> class vtp
vtp
&
operator
*=
(
vtp
other
)
{
v
*=
other
.
v
;
return
*
this
;
}
vtp
&
operator
/=
(
vtp
other
)
{
v
/=
other
.
v
;
return
*
this
;
}
vtp
abs
()
const
{
return
hlp
::
abs
(
v
);
}
template
<
typename
Func
>
vtp
apply
(
Func
func
)
const
{
vtp
res
;
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
res
[
i
]
=
func
(
v
[
i
]);
return
res
;
}
inline
vtp
sqrt
()
const
{
return
hlp
::
sqrt
(
v
);
}
vtp
max
(
const
vtp
&
other
)
const
...
...
@@ -188,6 +199,8 @@ template<typename Op, typename T, size_t len> T reduce(const vtp<T, len> &v, Op
res
=
op
(
res
,
v
[
i
]);
return
res
;
}
template
<
typename
T
,
size_t
len
>
vtp
<
T
,
len
>
exp
(
const
vtp
<
T
,
len
>
&
v
)
{
return
v
.
apply
(
myexp
<
T
>
);
}
template
<
typename
T
>
class
pseudoscalar
{
private:
...
...
@@ -407,6 +420,7 @@ using detail_simd::native_simd;
using
detail_simd
::
reduce
;
using
detail_simd
::
max
;
using
detail_simd
::
abs
;
using
detail_simd
::
exp
;
using
detail_simd
::
sqrt
;
using
detail_simd
::
any_of
;
using
detail_simd
::
none_of
;
...
...
src/mr_util/math/fft.h
View file @
1c168d5b
...
...
@@ -38,7 +38,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef MRUTIL_FFT_H
#define MRUTIL_FFT_H
#include <iostream>
#include "mr_util/math/fft1d.h"
#ifndef POCKETFFT_CACHE_SIZE
...
...
@@ -67,6 +67,7 @@ namespace mr {
namespace
detail_fft
{
using
shape_t
=
fmav_info
::
shape_t
;
using
stride_t
=
fmav_info
::
stride_t
;
constexpr
bool
FORWARD
=
true
,
BACKWARD
=
false
;
...
...
@@ -435,34 +436,79 @@ template<typename T> std::shared_ptr<T> get_plan(size_t length)
template
<
size_t
N
>
class
multi_iter
{
private:
shape_t
pos
;
fmav_info
iarr
,
oarr
;
ptrdiff_t
p_ii
,
p_i
[
N
],
str_i
,
p_oi
,
p_o
[
N
],
str_o
;
size_t
idim
,
rem
;
shape_t
shp
,
pos
;
stride_t
str_i
,
str_o
;
size_t
cshp_i
,
cshp_o
,
rem
;
ptrdiff_t
cstr_i
,
cstr_o
,
sstr_i
,
sstr_o
,
p_ii
,
p_i
[
N
],
p_oi
,
p_o
[
N
];
bool
uni_i
,
uni_o
;
void
advance_i
()
{
for
(
int
i_
=
int
(
pos
.
size
())
-
1
;
i_
>=
0
;
--
i_
)
for
(
size_t
i
=
0
;
i
<
pos
.
size
();
++
i
)
{
auto
i
=
size_t
(
i_
);
if
(
i
==
idim
)
continue
;
p_ii
+=
iarr
.
stride
(
i
);
p_oi
+=
oarr
.
stride
(
i
);
if
(
++
pos
[
i
]
<
iarr
.
shape
(
i
))
p_ii
+=
str_i
[
i
];
p_oi
+=
str_o
[
i
];
if
(
++
pos
[
i
]
<
shp
[
i
])
return
;
pos
[
i
]
=
0
;
p_ii
-=
ptrdiff_t
(
iarr
.
shape
(
i
))
*
iarr
.
stride
(
i
)
;
p_oi
-=
ptrdiff_t
(
oarr
.
shape
(
i
))
*
oarr
.
stride
(
i
)
;
p_ii
-=
ptrdiff_t
(
shp
[
i
])
*
str_i
[
i
]
;
p_oi
-=
ptrdiff_t
(
shp
[
i
])
*
str_o
[
i
]
;
}
}
public:
multi_iter
(
const
fmav_info
&
iarr_
,
const
fmav_info
&
oarr_
,
size_t
idim_
,
size_t
nshares
,
size_t
myshare
)
:
pos
(
iarr_
.
ndim
(),
0
),
iarr
(
iarr_
),
oarr
(
oarr_
),
p_ii
(
0
),
str_i
(
iarr
.
stride
(
idim_
)),
p_oi
(
0
),
str_o
(
oarr
.
stride
(
idim_
)),
idim
(
idim_
),
rem
(
iarr
.
size
()
/
iarr
.
shape
(
idim
))
multi_iter
(
const
fmav_info
&
iarr
,
const
fmav_info
&
oarr
,
size_t
idim
,
size_t
nshares
,
size_t
myshare
)
:
rem
(
iarr
.
size
()
/
iarr
.
shape
(
idim
)),
sstr_i
(
0
),
sstr_o
(
0
),
p_ii
(
0
),
p_oi
(
0
)
{
MR_assert
(
oarr
.
ndim
()
==
iarr
.
ndim
(),
"dimension mismatch"
);
MR_assert
(
iarr
.
ndim
()
>=
1
,
"not enough dimensions"
);
// Sort the extraneous dimensions in order of ascending output stride;
// this should improve overall cache re-use and avoid clashes between
// threads as much as possible.
shape_t
idx
(
iarr
.
ndim
());
std
::
iota
(
idx
.
begin
(),
idx
.
end
(),
0
);
sort
(
idx
.
begin
(),
idx
.
end
(),
[
&
oarr
](
size_t
i1
,
size_t
i2
)
{
return
oarr
.
stride
(
i1
)
<
oarr
.
stride
(
i2
);});
for
(
auto
i
:
idx
)
if
(
i
!=
idim
)
{
pos
.
push_back
(
0
);
MR_assert
(
iarr
.
shape
(
i
)
==
oarr
.
shape
(
i
),
"shape mismatch"
);
shp
.
push_back
(
iarr
.
shape
(
i
));
str_i
.
push_back
(
iarr
.
stride
(
i
));
str_o
.
push_back
(
oarr
.
stride
(
i
));
}
MR_assert
(
idim
<
iarr
.
ndim
(),
"bad active dimension"
);
cstr_i
=
iarr
.
stride
(
idim
);
cstr_o
=
oarr
.
stride
(
idim
);
cshp_i
=
iarr
.
shape
(
idim
);
cshp_o
=
oarr
.
shape
(
idim
);
// collapse unneeded dimensions
bool
done
=
false
;
while
(
!
done
)
{
done
=
true
;
for
(
size_t
i
=
1
;
i
<
shp
.
size
();
++
i
)
if
((
str_i
[
i
]
==
str_i
[
i
-
1
]
*
ptrdiff_t
(
shp
[
i
-
1
]))
&&
(
str_o
[
i
]
==
str_o
[
i
-
1
]
*
ptrdiff_t
(
shp
[
i
-
1
])))
{
shp
[
i
-
1
]
*=
shp
[
i
];
str_i
.
erase
(
str_i
.
begin
()
+
ptrdiff_t
(
i
));
str_o
.
erase
(
str_o
.
begin
()
+
ptrdiff_t
(
i
));
shp
.
erase
(
shp
.
begin
()
+
ptrdiff_t
(
i
));
pos
.
pop_back
();
done
=
false
;
// std::cout << "reduced dims" << std::endl;
}
}
if
(
pos
.
size
()
>
0
)
{
sstr_i
=
str_i
[
0
];
sstr_o
=
str_o
[
0
];
}
if
(
nshares
==
1
)
return
;
if
(
nshares
==
0
)
throw
std
::
runtime_error
(
"can't run with zero threads"
);
if
(
myshare
>=
nshares
)
throw
std
::
runtime_error
(
"impossible share requested"
);
...
...
@@ -473,16 +519,16 @@ template<size_t N> class multi_iter
size_t
todo
=
hi
-
lo
;
size_t
chunk
=
rem
;
for
(
size_t
i
=
0
;
i
<
pos
.
size
();
++
i
)
for
(
size_t
i
2
=
0
,
i
=
pos
.
size
()
-
1
;
i
2
<
pos
.
size
();
++
i2
,
--
i
)
{
if
(
i
==
idim
)
continue
;
chunk
/=
iarr
.
shape
(
i
);
chunk
/=
shp
[
i
];
size_t
n_advance
=
lo
/
chunk
;
pos
[
i
]
+=
n_advance
;
p_ii
+=
ptrdiff_t
(
n_advance
)
*
iarr
.
stride
(
i
)
;
p_oi
+=
ptrdiff_t
(
n_advance
)
*
oarr
.
stride
(
i
)
;
p_ii
+=
ptrdiff_t
(
n_advance
)
*
str_i
[
i
]
;
p_oi
+=
ptrdiff_t
(
n_advance
)
*
str_o
[
i
]
;
lo
-=
n_advance
*
chunk
;
}
MR_assert
(
lo
==
0
,
"must not happen"
);
rem
=
todo
;
}
void
advance
(
size_t
n
)
...
...
@@ -494,16 +540,30 @@ template<size_t N> class multi_iter
p_o
[
i
]
=
p_oi
;
advance_i
();
}
uni_i
=
uni_o
=
true
;
for
(
size_t
i
=
1
;
i
<
n
;
++
i
)
{
// std::cout << (p_i[i]-p_i[i-1]) << " " << sstr_i << std::endl;
uni_i
=
uni_i
&&
(
p_i
[
i
]
-
p_i
[
i
-
1
]
==
sstr_i
);
uni_o
=
uni_o
&&
(
p_o
[
i
]
-
p_o
[
i
-
1
]
==
sstr_o
);
}
// for (size_t i=0; i<n; ++i)
rem
-=
n
;
}
ptrdiff_t
iofs
(
size_t
i
)
const
{
return
p_i
[
0
]
+
ptrdiff_t
(
i
)
*
str_i
;
}
ptrdiff_t
iofs
(
size_t
j
,
size_t
i
)
const
{
return
p_i
[
j
]
+
ptrdiff_t
(
i
)
*
str_i
;
}
ptrdiff_t
oofs
(
size_t
i
)
const
{
return
p_o
[
0
]
+
ptrdiff_t
(
i
)
*
str_o
;
}
ptrdiff_t
oofs
(
size_t
j
,
size_t
i
)
const
{
return
p_o
[
j
]
+
ptrdiff_t
(
i
)
*
str_o
;
}
size_t
length_in
()
const
{
return
iarr
.
shape
(
idim
);
}
size_t
length_out
()
const
{
return
oarr
.
shape
(
idim
);
}
ptrdiff_t
stride_in
()
const
{
return
str_i
;
}
ptrdiff_t
stride_out
()
const
{
return
str_o
;
}
ptrdiff_t
iofs
(
size_t
i
)
const
{
return
p_i
[
0
]
+
ptrdiff_t
(
i
)
*
cstr_i
;
}
ptrdiff_t
iofs
(
size_t
j
,
size_t
i
)
const
{
return
p_i
[
j
]
+
ptrdiff_t
(
i
)
*
cstr_i
;
}
ptrdiff_t
iofs_uni
(
size_t
j
,
size_t
i
)
const
{
return
p_i
[
0
]
+
ptrdiff_t
(
j
)
*
sstr_i
+
ptrdiff_t
(
i
)
*
cstr_i
;
}
ptrdiff_t
oofs
(
size_t
i
)
const
{
return
p_o
[
0
]
+
ptrdiff_t
(
i
)
*
cstr_o
;
}
ptrdiff_t
oofs
(
size_t
j
,
size_t
i
)
const
{
return
p_o
[
j
]
+
ptrdiff_t
(
i
)
*
cstr_o
;
}
ptrdiff_t
oofs_uni
(
size_t
j
,
size_t
i
)
const
{
return
p_o
[
0
]
+
ptrdiff_t
(
j
)
*
sstr_o
+
ptrdiff_t
(
i
)
*
cstr_o
;
}
bool
uniform_i
()
const
{
return
uni_i
;
}
ptrdiff_t
unistride_i
()
const
{
return
sstr_i
;
}
bool
uniform_o
()
const
{
return
uni_o
;
}
ptrdiff_t
unistride_o
()
const
{
return
sstr_o
;
}
size_t
length_in
()
const
{
return
cshp_i
;
}
size_t
length_out
()
const
{
return
cshp_o
;
}
ptrdiff_t
stride_in
()
const
{
return
cstr_i
;
}
ptrdiff_t
stride_out
()
const
{
return
cstr_o
;
}
size_t
remaining
()
const
{
return
rem
;
}
};
...
...
@@ -579,49 +639,106 @@ template<typename T, typename T0> aligned_array<T> alloc_tmp
return
aligned_array
<
T
>
(
tmpsize
);
}
//#define MRFFT_PREFETCH
template
<
typename
T
,
size_t
vlen
>
void
copy_input
(
const
multi_iter
<
vlen
>
&
it
,
#define MRFFT_PREFETCH
#define MRUTIL_PREFETCH_R(addr) __builtin_prefetch(addr);
#define MRUTIL_PREFETCH_W(addr) __builtin_prefetch(addr,1);
template
<
typename
T
,
size_t
vlen
>
MRUTIL_NOINLINE
void
copy_input
(
const
multi_iter
<
vlen
>
&
it
,
const
fmav
<
Cmplx
<
T
>>
&
src
,
Cmplx
<
native_simd
<
T
>>
*
MRUTIL_RESTRICT
dst
)
{
size_t
i
=
0
;
#ifdef MRFFT_PREFETCH
constexpr
size_t
dist
=
16
;
for
(;
i
+
dist
<
it
.
length_in
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
__builtin_prefetch
(
&
src
[
it
.
iofs
(
j
,
i
+
dist
)]);
dst
[
i
].
r
[
j
]
=
src
[
it
.
iofs
(
j
,
i
)].
r
;
dst
[
i
].
i
[
j
]
=
src
[
it
.
iofs
(
j
,
i
)].
i
;
}
#endif
for
(;
i
<
it
.
length_in
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
if
(
it
.
uniform_i
())
{
auto
ptr
=
&
src
[
it
.
iofs_uni
(
0
,
0
)];
auto
jstr
=
it
.
unistride_i
();
auto
istr
=
it
.
stride_in
();
if
(
istr
==
1
)
for
(
size_t
i
=
0
;
i
<
it
.
length_in
();
++
i
)
{
Cmplx
<
native_simd
<
T
>>
stmp
;
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
auto
tmp
=
ptr
[
j
*
jstr
+
i
];
stmp
.
r
[
j
]
=
tmp
.
r
;
stmp
.
i
[
j
]
=
tmp
.
i
;
}
dst
[
i
]
=
stmp
;
}
else
if
(
jstr
==
1
)
for
(
size_t
i
=
0
;
i
<
it
.
length_in
();
++
i
)
{
Cmplx
<
native_simd
<
T
>>
stmp
;
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
auto
tmp
=
ptr
[
j
+
i
*
istr
];
stmp
.
r
[
j
]
=
tmp
.
r
;
stmp
.
i
[
j
]
=
tmp
.
i
;
}
dst
[
i
]
=
stmp
;
}
else
for
(
size_t
i
=
0
;
i
<
it
.
length_in
();
++
i
)
{
Cmplx
<
native_simd
<
T
>>
stmp
;
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
auto
tmp
=
src
[
it
.
iofs_uni
(
j
,
i
)];
stmp
.
r
[
j
]
=
tmp
.
r
;
stmp
.
i
[
j
]
=
tmp
.
i
;
}
dst
[
i
]
=
stmp
;
}
}
else
for
(
size_t
i
=
0
;
i
<
it
.
length_in
();
++
i
)
{
auto
tmp
=
src
[
it
.
iofs
(
j
,
i
)];
dst
[
i
].
r
[
j
]
=
tmp
.
r
;
dst
[
i
].
i
[
j
]
=
tmp
.
i
;
Cmplx
<
native_simd
<
T
>>
stmp
;
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
auto
tmp
=
src
[
it
.
iofs
(
j
,
i
)];
stmp
.
r
[
j
]
=
tmp
.
r
;
stmp
.
i
[
j
]
=
tmp
.
i
;
}
dst
[
i
]
=
stmp
;
}
}
template
<
typename
T
,
size_t
vlen
>
void
copy_input
(
const
multi_iter
<
vlen
>
&
it
,
template
<
typename
T
,
size_t
vlen
>
MRUTIL_NOINLINE
void
copy_input
(
const
multi_iter
<
vlen
>
&
it
,
const
fmav
<
T
>
&
src
,
native_simd
<
T
>
*
MRUTIL_RESTRICT
dst
)
{
size_t
i
=
0
;
#ifdef MRFFT_PREFETCH
constexpr
size_t
dist
=
16
;
for
(;
i
+
dist
<
it
.
length
_i
n
()
;
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
constexpr
size_t
dist
=
32
;
if
(
it
.
uniform
_i
())
for
(
;
i
+
dist
<
it
.
length_in
()
;
++
i
)
{
__builtin_prefetch
(
&
src
[
it
.
oofs
(
j
,
i
+
dist
)]);
dst
[
i
][
j
]
=
src
[
it
.
iofs
(
j
,
i
)];
native_simd
<
T
>
stmp
;
MRUTIL_PREFETCH_W
(
&
dst
[
i
+
dist
]);
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
MRUTIL_PREFETCH_R
(
&
src
[
it
.
iofs_uni
(
j
,
i
+
dist
)]);
stmp
[
j
]
=
src
[
it
.
iofs_uni
(
j
,
i
)];
}
dst
[
i
]
=
stmp
;
}
else
for
(;
i
+
dist
<
it
.
length_in
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
MRUTIL_PREFETCH_R
(
&
src
[
it
.
iofs
(
j
,
i
+
dist
)]);
MRUTIL_PREFETCH_W
(
&
dst
[
i
+
dist
]);
dst
[
i
][
j
]
=
src
[
it
.
iofs
(
j
,
i
)];
}
#endif
for
(;
i
<
it
.
length_in
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
dst
[
i
][
j
]
=
src
[
it
.
iofs
(
j
,
i
)];
if
(
it
.
uniform_i
())
for
(;
i
<
it
.
length_in
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
dst
[
i
][
j
]
=
src
[
it
.
iofs_uni
(
j
,
i
)];
else
for
(;
i
<
it
.
length_in
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
dst
[
i
][
j
]
=
src
[
it
.
iofs
(
j
,
i
)];
}
template
<
typename
T
,
size_t
vlen
>
void
copy_input
(
const
multi_iter
<
vlen
>
&
it
,
template
<
typename
T
,
size_t
vlen
>
MRUTIL_NOINLINE
void
copy_input
(
const
multi_iter
<
vlen
>
&
it
,
const
fmav
<
T
>
&
src
,
T
*
MRUTIL_RESTRICT
dst
)
{
if
(
dst
==
&
src
[
it
.
iofs
(
0
)])
return
;
// in-place
...
...
@@ -629,45 +746,71 @@ template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
dst
[
i
]
=
src
[
it
.
iofs
(
i
)];
}
template
<
typename
T
,
size_t
vlen
>
void
copy_output
(
const
multi_iter
<
vlen
>
&
it
,
template
<
typename
T
,
size_t
vlen
>
MRUTIL_NOINLINE
void
copy_output
(
const
multi_iter
<
vlen
>
&
it
,
const
Cmplx
<
native_simd
<
T
>>
*
MRUTIL_RESTRICT
src
,
fmav
<
Cmplx
<
T
>>
&
dst
)
{
auto
ptr
=
dst
.
vdata
();
size_t
i
=
0
;
#ifdef MRFFT_PREFETCH
constexpr
size_t
dist
=
16
;
for
(;
i
+
dist
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
__builtin_prefetch
(
&
ptr
[
it
.
oofs
(
j
,
i
+
dist
)],
1
,
3
);
ptr
[
it
.
oofs
(
j
,
i
)].
Set
(
src
[
i
].
r
[
j
],
src
[
i
].
i
[
j
]);
}
constexpr
size_t
dist
=
32
;
if
(
it
.
uniform_o
())
for
(;
i
+
dist
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
MRUTIL_PREFETCH_W
(
&
ptr
[
it
.
oofs_uni
(
j
,
i
+
dist
)]);
ptr
[
it
.
oofs_uni
(
j
,
i
)].
Set
(
src
[
i
].
r
[
j
],
src
[
i
].
i
[
j
]);
}
else
for
(;
i
+
dist
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
MRUTIL_PREFETCH_W
(
&
ptr
[
it
.
oofs
(
j
,
i
+
dist
)]);
ptr
[
it
.
oofs
(
j
,
i
)].
Set
(
src
[
i
].
r
[
j
],
src
[
i
].
i
[
j
]);
}
#endif
for
(;
i
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
ptr
[
it
.
oofs
(
j
,
i
)].
Set
(
src
[
i
].
r
[
j
],
src
[
i
].
i
[
j
]);
if
(
it
.
uniform_o
())
for
(;
i
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
ptr
[
it
.
oofs_uni
(
j
,
i
)].
Set
(
src
[
i
].
r
[
j
],
src
[
i
].
i
[
j
]);
else
for
(;
i
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
ptr
[
it
.
oofs
(
j
,
i
)].
Set
(
src
[
i
].
r
[
j
],
src
[
i
].
i
[
j
]);
}
template
<
typename
T
,
size_t
vlen
>
void
copy_output
(
const
multi_iter
<
vlen
>
&
it
,
template
<
typename
T
,
size_t
vlen
>
MRUTIL_NOINLINE
void
copy_output
(
const
multi_iter
<
vlen
>
&
it
,
const
native_simd
<
T
>
*
MRUTIL_RESTRICT
src
,
fmav
<
T
>
&
dst
)
{
auto
ptr
=
dst
.
vdata
();
size_t
i
=
0
;
#ifdef MRFFT_PREFETCH
constexpr
size_t
dist
=
16
;
for
(;
i
+
dist
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
__builtin_prefetch
(
&
ptr
[
it
.
oofs
(
j
,
i
+
dist
)],
1
,
3
);
ptr
[
it
.
oofs
(
j
,
i
)]
=
src
[
i
][
j
];
}
constexpr
size_t
dist
=
32
;
if
(
it
.
uniform_o
())
for
(;
i
+
dist
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
MRUTIL_PREFETCH_W
(
&
ptr
[
it
.
oofs_uni
(
j
,
i
+
dist
)]);
ptr
[
it
.
oofs_uni
(
j
,
i
)]
=
src
[
i
][
j
];
}
else
for
(;
i
+
dist
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
MRUTIL_PREFETCH_W
(
&
ptr
[
it
.
oofs
(
j
,
i
+
dist
)]);
ptr
[
it
.
oofs
(
j
,
i
)]
=
src
[
i
][
j
];
}
#endif
for
(;
i
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
ptr
[
it
.
oofs
(
j
,
i
)]
=
src
[
i
][
j
];
if
(
it
.
uniform_o
())
for
(;
i
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
ptr
[
it
.
oofs_uni
(
j
,
i
)]
=
src
[
i
][
j
];
else
for
(;
i
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
ptr
[
it
.
oofs
(
j
,
i
)]
=
src
[
i
][
j
];
}
template
<
typename
T
,
size_t
vlen
>
void
copy_output
(
const
multi_iter
<
vlen
>
&
it
,
template
<
typename
T
,
size_t
vlen
>
MRUTIL_NOINLINE
void
copy_output
(
const
multi_iter
<
vlen
>
&
it
,
const
T
*
MRUTIL_RESTRICT
src
,
fmav
<
T
>
&
dst
)
{
auto
ptr
=
dst
.
vdata
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment