Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Martin Reinecke
ducc
Commits
1c168d5b
Commit
1c168d5b
authored
May 13, 2020
by
Martin Reinecke
Browse files
current state
parent
da0ac67a
Pipeline
#74814
failed with stages
in 11 minutes and 54 seconds
Changes
4
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
pypocketfft/demos/bench.py
View file @
1c168d5b
...
...
@@ -118,25 +118,26 @@ def bench_nd(ndim, nmax, nthr, ntry, tp, funcs, nrepeat, ttl="", filename="",
tmp
=
func
(
a
,
nrepeat
,
nthr
)
res
.
append
(
tmp
[
0
])
output
.
append
(
tmp
[
1
])
print
(
"{0:5.2e}/{1:5.2e} = {2:5.2f} L2 error={3}"
.
format
(
results
[
0
][
n
],
results
[
1
][
n
],
results
[
0
][
n
]
/
results
[
1
][
n
],
_l2error
(
output
[
0
],
output
[
1
])))
results
=
np
.
array
(
results
)
plt
.
title
(
"{}: {}D, {}, max_extent={}"
.
format
(
ttl
,
ndim
,
str
(
tp
),
nmax
))
plt
.
xlabel
(
"time ratio"
)
plt
.
ylabel
(
"counts"
)
plt
.
hist
(
results
[
0
,
:]
/
results
[
1
,
:],
bins
=
"auto"
)
if
filename
!=
""
:
plt
.
savefig
(
filename
)
plt
.
show
()
funcs
=
(
measure_pypocketfft
,
measure_fftw
)
#
print("{0:5.2e}/{1:5.2e} = {2:5.2f} L2 error={3}".format(results[0][n],results[1][n],results[0][n]/results[1][n],_l2error(output[0],output[1])))
#
results = np.array(results)
#
plt.title("{}: {}D, {}, max_extent={}".format(
#
ttl, ndim, str(tp), nmax))
#
plt.xlabel("time ratio")
#
plt.ylabel("counts")
#
plt.hist(results[0, :]/results[1, :], bins="auto")
#
if filename != "":
#
plt.savefig(filename)
#
plt.show()
funcs
=
(
measure_pypocketfft
,)
ttl
=
"pypocketfft/FFTW()"
ntry
=
100
nthr
=
1
nice_sizes
=
True
bench_nd
(
1
,
8192
,
nthr
,
100
,
"c16"
,
funcs
,
10
,
ttl
,
"1d.png"
,
nice_sizes
)
bench_nd
(
2
,
2048
,
nthr
,
100
,
"c16"
,
funcs
,
2
,
ttl
,
"2d.png"
,
nice_sizes
)
bench_nd
(
3
,
256
,
nthr
,
100
,
"c16"
,
funcs
,
2
,
ttl
,
"3d.png"
,
nice_sizes
)
bench_nd
(
1
,
8192
,
nthr
,
100
,
"c8"
,
funcs
,
10
,
ttl
,
"1d_single.png"
,
nice_sizes
)
bench_nd
(
2
,
2048
,
nthr
,
100
,
"c8"
,
funcs
,
2
,
ttl
,
"2d_single.png"
,
nice_sizes
)
bench_nd
(
3
,
256
,
nthr
,
100
,
"c8"
,
funcs
,
2
,
ttl
,
"3d_single.png"
,
nice_sizes
)
#
bench_nd(1, 8192, nthr,
ntry
, "c16", funcs, 10, ttl, "1d.png", nice_sizes)
bench_nd
(
2
,
2048
,
nthr
,
ntry
,
"c16"
,
funcs
,
2
,
ttl
,
"2d.png"
,
nice_sizes
)
#
bench_nd(3, 256, nthr,
ntry
, "c16", funcs, 2, ttl, "3d.png", nice_sizes)
#
bench_nd(1, 8192, nthr,
ntry
, "c8", funcs, 10, ttl, "1d_single.png", nice_sizes)
#
bench_nd(2, 2048, nthr,
ntry
, "c8", funcs, 2, ttl, "2d_single.png", nice_sizes)
#
bench_nd(3, 256, nthr,
ntry
, "c8", funcs, 2, ttl, "3d_single.png", nice_sizes)
src/mr_util/infra/communication.h
View file @
1c168d5b
...
...
@@ -24,6 +24,8 @@
#ifndef MRUTIL_COMMUNICATION_H
#define MRUTIL_COMMUNICATION_H
#define MRUTIL_USE_MPI
#include <vector>
#ifdef MRUTIL_USE_MPI
#include <mpi.h>
...
...
@@ -95,8 +97,7 @@ class Communicator
template
<
typename
T
>
void
sendrecvRaw
(
const
T
*
sendbuf
,
size_t
sendcnt
,
size_t
dest
,
T
*
recvbuf
,
size_t
recvcnt
,
size_t
src
)
const
{
sendrecvRawVoid
(
sendbuf
,
sendcnt
,
dest
,
recvbuf
,
recvcnt
,
src
,
tidx
<
T
>
());
sendrecvRawVoid
(
sendbuf
,
sendcnt
,
dest
,
recvbuf
,
recvcnt
,
src
,
tidx
<
T
>
());
}
template
<
typename
T
>
void
sendrecv_replaceRaw
(
T
*
data
,
size_t
num
,
size_t
dest
,
size_t
src
)
const
...
...
@@ -110,6 +111,12 @@ class Communicator
template
<
typename
T
>
void
allgathervRaw
(
const
T
*
in
,
int
numin
,
T
*
out
,
const
int
*
numout
,
const
int
*
disout
)
const
{
allgathervRawVoid
(
in
,
numin
,
out
,
numout
,
disout
,
tidx
<
T
>
());
}
template
<
typename
T
>
vector
<
T
>
allgatherVec
(
const
T
&
in
)
const
{
vector
<
T
>
res
(
num_ranks_
);
allgatherRaw
(
&
in
,
res
.
data
(),
1
);
return
res
;
}
template
<
typename
T
>
T
allreduce
(
const
T
&
in
,
redOp
op
)
const
{
...
...
@@ -117,13 +124,18 @@ class Communicator
allreduceRaw
(
&
in
,
&
out
,
1
,
op
);
return
out
;
}
template
<
typename
T
>
std
::
vector
<
T
>
allreduce
template
<
typename
T
>
std
::
vector
<
T
>
allreduce
Vec
(
const
std
::
vector
<
T
>
&
in
,
redOp
op
)
const
{
std
::
vector
<
T
>
out
(
in
.
size
());
allreduceRaw
(
in
.
data
(),
out
.
data
(),
in
.
size
(),
op
);
return
out
;
}
template
<
typename
T
>
void
sendrecvVec
(
const
vector
<
T
>
&
sendbuf
,
size_t
dest
,
vector
<
T
>
&
recvbuf
,
size_t
src
)
const
{
sendrecvRaw
(
sendbuf
.
data
(),
sendbuf
.
size
(),
dest
,
recvbuf
.
data
(),
recvbuf
.
size
(),
src
);
}
/*! NB: \a num refers to the <i>total</i> number of items in the arrays;
the individual message size is \a num/num_ranks(). */
template
<
typename
T
>
void
all2allRaw
(
const
T
*
in
,
T
*
out
,
size_t
num
)
const
...
...
src/mr_util/infra/simd.h
View file @
1c168d5b
...
...
@@ -52,6 +52,10 @@ namespace mr {
namespace
detail_simd
{
template
<
typename
T
>
T
myexp
(
T
);
// {return -42;}
template
<
>
inline
double
myexp
(
double
v
)
{
return
std
::
exp
(
v
);}
template
<
>
inline
float
myexp
(
float
v
)
{
return
std
::
exp
(
v
);}
template
<
typename
T
>
constexpr
inline
bool
vectorizable
=
false
;
template
<
>
constexpr
inline
bool
vectorizable
<
float
>
=
true
;
template
<
>
constexpr
inline
bool
vectorizable
<
double
>
=
true
;
...
...
@@ -114,6 +118,13 @@ template<typename T, size_t len> class vtp
vtp
&
operator
*=
(
vtp
other
)
{
v
*=
other
.
v
;
return
*
this
;
}
vtp
&
operator
/=
(
vtp
other
)
{
v
/=
other
.
v
;
return
*
this
;
}
vtp
abs
()
const
{
return
hlp
::
abs
(
v
);
}
template
<
typename
Func
>
vtp
apply
(
Func
func
)
const
{
vtp
res
;
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
res
[
i
]
=
func
(
v
[
i
]);
return
res
;
}
inline
vtp
sqrt
()
const
{
return
hlp
::
sqrt
(
v
);
}
vtp
max
(
const
vtp
&
other
)
const
...
...
@@ -188,6 +199,8 @@ template<typename Op, typename T, size_t len> T reduce(const vtp<T, len> &v, Op
res
=
op
(
res
,
v
[
i
]);
return
res
;
}
template
<
typename
T
,
size_t
len
>
vtp
<
T
,
len
>
exp
(
const
vtp
<
T
,
len
>
&
v
)
{
return
v
.
apply
(
myexp
<
T
>
);
}
template
<
typename
T
>
class
pseudoscalar
{
private:
...
...
@@ -407,6 +420,7 @@ using detail_simd::native_simd;
using
detail_simd
::
reduce
;
using
detail_simd
::
max
;
using
detail_simd
::
abs
;
using
detail_simd
::
exp
;
using
detail_simd
::
sqrt
;
using
detail_simd
::
any_of
;
using
detail_simd
::
none_of
;
...
...
src/mr_util/math/fft.h
View file @
1c168d5b
...
...
@@ -38,7 +38,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef MRUTIL_FFT_H
#define MRUTIL_FFT_H
#include <iostream>
#include "mr_util/math/fft1d.h"
#ifndef POCKETFFT_CACHE_SIZE
...
...
@@ -67,6 +67,7 @@ namespace mr {
namespace
detail_fft
{
using
shape_t
=
fmav_info
::
shape_t
;
using
stride_t
=
fmav_info
::
stride_t
;
constexpr
bool
FORWARD
=
true
,
BACKWARD
=
false
;
...
...
@@ -435,34 +436,79 @@ template<typename T> std::shared_ptr<T> get_plan(size_t length)
template
<
size_t
N
>
class
multi_iter
{
private:
shape_t
pos
;
fmav_info
iarr
,
oarr
;
ptrdiff_t
p_ii
,
p_i
[
N
],
str_i
,
p_oi
,
p_o
[
N
],
str_o
;
size_t
idim
,
rem
;
shape_t
shp
,
pos
;
stride_t
str_i
,
str_o
;
size_t
cshp_i
,
cshp_o
,
rem
;
ptrdiff_t
cstr_i
,
cstr_o
,
sstr_i
,
sstr_o
,
p_ii
,
p_i
[
N
],
p_oi
,
p_o
[
N
];
bool
uni_i
,
uni_o
;
void
advance_i
()
{
for
(
int
i_
=
int
(
pos
.
size
())
-
1
;
i_
>=
0
;
--
i_
)
for
(
size_t
i
=
0
;
i
<
pos
.
size
();
++
i
)
{
auto
i
=
size_t
(
i_
);
if
(
i
==
idim
)
continue
;
p_ii
+=
iarr
.
stride
(
i
);
p_oi
+=
oarr
.
stride
(
i
);
if
(
++
pos
[
i
]
<
iarr
.
shape
(
i
))
p_ii
+=
str_i
[
i
];
p_oi
+=
str_o
[
i
];
if
(
++
pos
[
i
]
<
shp
[
i
])
return
;
pos
[
i
]
=
0
;
p_ii
-=
ptrdiff_t
(
iarr
.
shape
(
i
))
*
iarr
.
stride
(
i
)
;
p_oi
-=
ptrdiff_t
(
oarr
.
shape
(
i
))
*
oarr
.
stride
(
i
)
;
p_ii
-=
ptrdiff_t
(
shp
[
i
])
*
str_i
[
i
]
;
p_oi
-=
ptrdiff_t
(
shp
[
i
])
*
str_o
[
i
]
;
}
}
public:
multi_iter
(
const
fmav_info
&
iarr_
,
const
fmav_info
&
oarr_
,
size_t
idim_
,
size_t
nshares
,
size_t
myshare
)
:
pos
(
iarr_
.
ndim
(),
0
),
iarr
(
iarr_
),
oarr
(
oarr_
),
p_ii
(
0
),
str_i
(
iarr
.
stride
(
idim_
)),
p_oi
(
0
),
str_o
(
oarr
.
stride
(
idim_
)),
idim
(
idim_
),
rem
(
iarr
.
size
()
/
iarr
.
shape
(
idim
))
multi_iter
(
const
fmav_info
&
iarr
,
const
fmav_info
&
oarr
,
size_t
idim
,
size_t
nshares
,
size_t
myshare
)
:
rem
(
iarr
.
size
()
/
iarr
.
shape
(
idim
)),
sstr_i
(
0
),
sstr_o
(
0
),
p_ii
(
0
),
p_oi
(
0
)
{
MR_assert
(
oarr
.
ndim
()
==
iarr
.
ndim
(),
"dimension mismatch"
);
MR_assert
(
iarr
.
ndim
()
>=
1
,
"not enough dimensions"
);
// Sort the extraneous dimensions in order of ascending output stride;
// this should improve overall cache re-use and avoid clashes between
// threads as much as possible.
shape_t
idx
(
iarr
.
ndim
());
std
::
iota
(
idx
.
begin
(),
idx
.
end
(),
0
);
sort
(
idx
.
begin
(),
idx
.
end
(),
[
&
oarr
](
size_t
i1
,
size_t
i2
)
{
return
oarr
.
stride
(
i1
)
<
oarr
.
stride
(
i2
);});
for
(
auto
i
:
idx
)
if
(
i
!=
idim
)
{
pos
.
push_back
(
0
);
MR_assert
(
iarr
.
shape
(
i
)
==
oarr
.
shape
(
i
),
"shape mismatch"
);
shp
.
push_back
(
iarr
.
shape
(
i
));
str_i
.
push_back
(
iarr
.
stride
(
i
));
str_o
.
push_back
(
oarr
.
stride
(
i
));
}
MR_assert
(
idim
<
iarr
.
ndim
(),
"bad active dimension"
);
cstr_i
=
iarr
.
stride
(
idim
);
cstr_o
=
oarr
.
stride
(
idim
);
cshp_i
=
iarr
.
shape
(
idim
);
cshp_o
=
oarr
.
shape
(
idim
);
// collapse unneeded dimensions
bool
done
=
false
;
while
(
!
done
)
{
done
=
true
;
for
(
size_t
i
=
1
;
i
<
shp
.
size
();
++
i
)
if
((
str_i
[
i
]
==
str_i
[
i
-
1
]
*
ptrdiff_t
(
shp
[
i
-
1
]))
&&
(
str_o
[
i
]
==
str_o
[
i
-
1
]
*
ptrdiff_t
(
shp
[
i
-
1
])))
{
shp
[
i
-
1
]
*=
shp
[
i
];
str_i
.
erase
(
str_i
.
begin
()
+
ptrdiff_t
(
i
));
str_o
.
erase
(
str_o
.
begin
()
+
ptrdiff_t
(
i
));
shp
.
erase
(
shp
.
begin
()
+
ptrdiff_t
(
i
));
pos
.
pop_back
();
done
=
false
;
// std::cout << "reduced dims" << std::endl;
}
}
if
(
pos
.
size
()
>
0
)
{
sstr_i
=
str_i
[
0
];
sstr_o
=
str_o
[
0
];
}
if
(
nshares
==
1
)
return
;
if
(
nshares
==
0
)
throw
std
::
runtime_error
(
"can't run with zero threads"
);
if
(
myshare
>=
nshares
)
throw
std
::
runtime_error
(
"impossible share requested"
);
...
...
@@ -473,16 +519,16 @@ template<size_t N> class multi_iter
size_t
todo
=
hi
-
lo
;
size_t
chunk
=
rem
;
for
(
size_t
i
=
0
;
i
<
pos
.
size
();
++
i
)
for
(
size_t
i
2
=
0
,
i
=
pos
.
size
()
-
1
;
i
2
<
pos
.
size
();
++
i2
,
--
i
)
{
if
(
i
==
idim
)
continue
;
chunk
/=
iarr
.
shape
(
i
);
chunk
/=
shp
[
i
];
size_t
n_advance
=
lo
/
chunk
;
pos
[
i
]
+=
n_advance
;
p_ii
+=
ptrdiff_t
(
n_advance
)
*
iarr
.
stride
(
i
)
;
p_oi
+=
ptrdiff_t
(
n_advance
)
*
oarr
.
stride
(
i
)
;
p_ii
+=
ptrdiff_t
(
n_advance
)
*
str_i
[
i
]
;
p_oi
+=
ptrdiff_t
(
n_advance
)
*
str_o
[
i
]
;
lo
-=
n_advance
*
chunk
;
}
MR_assert
(
lo
==
0
,
"must not happen"
);
rem
=
todo
;
}
void
advance
(
size_t
n
)
...
...
@@ -494,16 +540,30 @@ template<size_t N> class multi_iter
p_o
[
i
]
=
p_oi
;
advance_i
();
}
uni_i
=
uni_o
=
true
;
for
(
size_t
i
=
1
;
i
<
n
;
++
i
)
{
// std::cout << (p_i[i]-p_i[i-1]) << " " << sstr_i << std::endl;
uni_i
=
uni_i
&&
(
p_i
[
i
]
-
p_i
[
i
-
1
]
==
sstr_i
);
uni_o
=
uni_o
&&
(
p_o
[
i
]
-
p_o
[
i
-
1
]
==
sstr_o
);
}
// for (size_t i=0; i<n; ++i)
rem
-=
n
;
}
ptrdiff_t
iofs
(
size_t
i
)
const
{
return
p_i
[
0
]
+
ptrdiff_t
(
i
)
*
str_i
;
}
ptrdiff_t
iofs
(
size_t
j
,
size_t
i
)
const
{
return
p_i
[
j
]
+
ptrdiff_t
(
i
)
*
str_i
;
}
ptrdiff_t
oofs
(
size_t
i
)
const
{
return
p_o
[
0
]
+
ptrdiff_t
(
i
)
*
str_o
;
}
ptrdiff_t
oofs
(
size_t
j
,
size_t
i
)
const
{
return
p_o
[
j
]
+
ptrdiff_t
(
i
)
*
str_o
;
}
size_t
length_in
()
const
{
return
iarr
.
shape
(
idim
);
}
size_t
length_out
()
const
{
return
oarr
.
shape
(
idim
);
}
ptrdiff_t
stride_in
()
const
{
return
str_i
;
}
ptrdiff_t
stride_out
()
const
{
return
str_o
;
}
ptrdiff_t
iofs
(
size_t
i
)
const
{
return
p_i
[
0
]
+
ptrdiff_t
(
i
)
*
cstr_i
;
}
ptrdiff_t
iofs
(
size_t
j
,
size_t
i
)
const
{
return
p_i
[
j
]
+
ptrdiff_t
(
i
)
*
cstr_i
;
}
ptrdiff_t
iofs_uni
(
size_t
j
,
size_t
i
)
const
{
return
p_i
[
0
]
+
ptrdiff_t
(
j
)
*
sstr_i
+
ptrdiff_t
(
i
)
*
cstr_i
;
}
ptrdiff_t
oofs
(
size_t
i
)
const
{
return
p_o
[
0
]
+
ptrdiff_t
(
i
)
*
cstr_o
;
}
ptrdiff_t
oofs
(
size_t
j
,
size_t
i
)
const
{
return
p_o
[
j
]
+
ptrdiff_t
(
i
)
*
cstr_o
;
}
ptrdiff_t
oofs_uni
(
size_t
j
,
size_t
i
)
const
{
return
p_o
[
0
]
+
ptrdiff_t
(
j
)
*
sstr_o
+
ptrdiff_t
(
i
)
*
cstr_o
;
}
bool
uniform_i
()
const
{
return
uni_i
;
}
ptrdiff_t
unistride_i
()
const
{
return
sstr_i
;
}
bool
uniform_o
()
const
{
return
uni_o
;
}
ptrdiff_t
unistride_o
()
const
{
return
sstr_o
;
}
size_t
length_in
()
const
{
return
cshp_i
;
}
size_t
length_out
()
const
{
return
cshp_o
;
}
ptrdiff_t
stride_in
()
const
{
return
cstr_i
;
}
ptrdiff_t
stride_out
()
const
{
return
cstr_o
;
}
size_t
remaining
()
const
{
return
rem
;
}
};
...
...
@@ -579,49 +639,106 @@ template<typename T, typename T0> aligned_array<T> alloc_tmp
return
aligned_array
<
T
>
(
tmpsize
);
}
//#define MRFFT_PREFETCH
template
<
typename
T
,
size_t
vlen
>
void
copy_input
(
const
multi_iter
<
vlen
>
&
it
,
#define MRFFT_PREFETCH
#define MRUTIL_PREFETCH_R(addr) __builtin_prefetch(addr);
#define MRUTIL_PREFETCH_W(addr) __builtin_prefetch(addr,1);
template
<
typename
T
,
size_t
vlen
>
MRUTIL_NOINLINE
void
copy_input
(
const
multi_iter
<
vlen
>
&
it
,
const
fmav
<
Cmplx
<
T
>>
&
src
,
Cmplx
<
native_simd
<
T
>>
*
MRUTIL_RESTRICT
dst
)
{
size_t
i
=
0
;
#ifdef MRFFT_PREFETCH
constexpr
size_t
dist
=
16
;
for
(;
i
+
dist
<
it
.
length_in
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
__builtin_prefetch
(
&
src
[
it
.
iofs
(
j
,
i
+
dist
)]);
dst
[
i
].
r
[
j
]
=
src
[
it
.
iofs
(
j
,
i
)].
r
;
dst
[
i
].
i
[
j
]
=
src
[
it
.
iofs
(
j
,
i
)].
i
;
}
#endif
for
(;
i
<
it
.
length_in
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
if
(
it
.
uniform_i
())
{
auto
ptr
=
&
src
[
it
.
iofs_uni
(
0
,
0
)];
auto
jstr
=
it
.
unistride_i
();
auto
istr
=
it
.
stride_in
();
if
(
istr
==
1
)
for
(
size_t
i
=
0
;
i
<
it
.
length_in
();
++
i
)
{
Cmplx
<
native_simd
<
T
>>
stmp
;
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
auto
tmp
=
ptr
[
j
*
jstr
+
i
];
stmp
.
r
[
j
]
=
tmp
.
r
;
stmp
.
i
[
j
]
=
tmp
.
i
;
}
dst
[
i
]
=
stmp
;
}
else
if
(
jstr
==
1
)
for
(
size_t
i
=
0
;
i
<
it
.
length_in
();
++
i
)
{
Cmplx
<
native_simd
<
T
>>
stmp
;
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
auto
tmp
=
ptr
[
j
+
i
*
istr
];
stmp
.
r
[
j
]
=
tmp
.
r
;
stmp
.
i
[
j
]
=
tmp
.
i
;
}
dst
[
i
]
=
stmp
;
}
else
for
(
size_t
i
=
0
;
i
<
it
.
length_in
();
++
i
)
{
Cmplx
<
native_simd
<
T
>>
stmp
;
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
auto
tmp
=
src
[
it
.
iofs_uni
(
j
,
i
)];
stmp
.
r
[
j
]
=
tmp
.
r
;
stmp
.
i
[
j
]
=
tmp
.
i
;
}
dst
[
i
]
=
stmp
;
}
}
else
for
(
size_t
i
=
0
;
i
<
it
.
length_in
();
++
i
)
{
auto
tmp
=
src
[
it
.
iofs
(
j
,
i
)];
dst
[
i
].
r
[
j
]
=
tmp
.
r
;
dst
[
i
].
i
[
j
]
=
tmp
.
i
;
Cmplx
<
native_simd
<
T
>>
stmp
;
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
auto
tmp
=
src
[
it
.
iofs
(
j
,
i
)];
stmp
.
r
[
j
]
=
tmp
.
r
;
stmp
.
i
[
j
]
=
tmp
.
i
;
}
dst
[
i
]
=
stmp
;
}
}
template
<
typename
T
,
size_t
vlen
>
void
copy_input
(
const
multi_iter
<
vlen
>
&
it
,
template
<
typename
T
,
size_t
vlen
>
MRUTIL_NOINLINE
void
copy_input
(
const
multi_iter
<
vlen
>
&
it
,
const
fmav
<
T
>
&
src
,
native_simd
<
T
>
*
MRUTIL_RESTRICT
dst
)
{
size_t
i
=
0
;
#ifdef MRFFT_PREFETCH
constexpr
size_t
dist
=
16
;
for
(;
i
+
dist
<
it
.
length
_i
n
()
;
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
constexpr
size_t
dist
=
32
;
if
(
it
.
uniform
_i
())
for
(
;
i
+
dist
<
it
.
length_in
()
;
++
i
)
{
__builtin_prefetch
(
&
src
[
it
.
oofs
(
j
,
i
+
dist
)]);
dst
[
i
][
j
]
=
src
[
it
.
iofs
(
j
,
i
)];
native_simd
<
T
>
stmp
;
MRUTIL_PREFETCH_W
(
&
dst
[
i
+
dist
]);
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
MRUTIL_PREFETCH_R
(
&
src
[
it
.
iofs_uni
(
j
,
i
+
dist
)]);
stmp
[
j
]
=
src
[
it
.
iofs_uni
(
j
,
i
)];
}
dst
[
i
]
=
stmp
;
}
else
for
(;
i
+
dist
<
it
.
length_in
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
MRUTIL_PREFETCH_R
(
&
src
[
it
.
iofs
(
j
,
i
+
dist
)]);
MRUTIL_PREFETCH_W
(
&
dst
[
i
+
dist
]);
dst
[
i
][
j
]
=
src
[
it
.
iofs
(
j
,
i
)];
}
#endif
for
(;
i
<
it
.
length_in
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
dst
[
i
][
j
]
=
src
[
it
.
iofs
(
j
,
i
)];
if
(
it
.
uniform_i
())
for
(;
i
<
it
.
length_in
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
dst
[
i
][
j
]
=
src
[
it
.
iofs_uni
(
j
,
i
)];
else
for
(;
i
<
it
.
length_in
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
dst
[
i
][
j
]
=
src
[
it
.
iofs
(
j
,
i
)];
}
template
<
typename
T
,
size_t
vlen
>
void
copy_input
(
const
multi_iter
<
vlen
>
&
it
,
template
<
typename
T
,
size_t
vlen
>
MRUTIL_NOINLINE
void
copy_input
(
const
multi_iter
<
vlen
>
&
it
,
const
fmav
<
T
>
&
src
,
T
*
MRUTIL_RESTRICT
dst
)
{
if
(
dst
==
&
src
[
it
.
iofs
(
0
)])
return
;
// in-place
...
...
@@ -629,45 +746,71 @@ template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
dst
[
i
]
=
src
[
it
.
iofs
(
i
)];
}
template
<
typename
T
,
size_t
vlen
>
void
copy_output
(
const
multi_iter
<
vlen
>
&
it
,
template
<
typename
T
,
size_t
vlen
>
MRUTIL_NOINLINE
void
copy_output
(
const
multi_iter
<
vlen
>
&
it
,
const
Cmplx
<
native_simd
<
T
>>
*
MRUTIL_RESTRICT
src
,
fmav
<
Cmplx
<
T
>>
&
dst
)
{
auto
ptr
=
dst
.
vdata
();
size_t
i
=
0
;
#ifdef MRFFT_PREFETCH
constexpr
size_t
dist
=
16
;
for
(;
i
+
dist
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
__builtin_prefetch
(
&
ptr
[
it
.
oofs
(
j
,
i
+
dist
)],
1
,
3
);
ptr
[
it
.
oofs
(
j
,
i
)].
Set
(
src
[
i
].
r
[
j
],
src
[
i
].
i
[
j
]);
}
constexpr
size_t
dist
=
32
;
if
(
it
.
uniform_o
())
for
(;
i
+
dist
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
MRUTIL_PREFETCH_W
(
&
ptr
[
it
.
oofs_uni
(
j
,
i
+
dist
)]);
ptr
[
it
.
oofs_uni
(
j
,
i
)].
Set
(
src
[
i
].
r
[
j
],
src
[
i
].
i
[
j
]);
}
else
for
(;
i
+
dist
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
MRUTIL_PREFETCH_W
(
&
ptr
[
it
.
oofs
(
j
,
i
+
dist
)]);
ptr
[
it
.
oofs
(
j
,
i
)].
Set
(
src
[
i
].
r
[
j
],
src
[
i
].
i
[
j
]);
}
#endif
for
(;
i
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
ptr
[
it
.
oofs
(
j
,
i
)].
Set
(
src
[
i
].
r
[
j
],
src
[
i
].
i
[
j
]);
if
(
it
.
uniform_o
())
for
(;
i
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
ptr
[
it
.
oofs_uni
(
j
,
i
)].
Set
(
src
[
i
].
r
[
j
],
src
[
i
].
i
[
j
]);
else
for
(;
i
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
ptr
[
it
.
oofs
(
j
,
i
)].
Set
(
src
[
i
].
r
[
j
],
src
[
i
].
i
[
j
]);
}
template
<
typename
T
,
size_t
vlen
>
void
copy_output
(
const
multi_iter
<
vlen
>
&
it
,
template
<
typename
T
,
size_t
vlen
>
MRUTIL_NOINLINE
void
copy_output
(
const
multi_iter
<
vlen
>
&
it
,
const
native_simd
<
T
>
*
MRUTIL_RESTRICT
src
,
fmav
<
T
>
&
dst
)
{
auto
ptr
=
dst
.
vdata
();
size_t
i
=
0
;
#ifdef MRFFT_PREFETCH
constexpr
size_t
dist
=
16
;
for
(;
i
+
dist
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
__builtin_prefetch
(
&
ptr
[
it
.
oofs
(
j
,
i
+
dist
)],
1
,
3
);
ptr
[
it
.
oofs
(
j
,
i
)]
=
src
[
i
][
j
];
}
constexpr
size_t
dist
=
32
;
if
(
it
.
uniform_o
())
for
(;
i
+
dist
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
MRUTIL_PREFETCH_W
(
&
ptr
[
it
.
oofs_uni
(
j
,
i
+
dist
)]);
ptr
[
it
.
oofs_uni
(
j
,
i
)]
=
src
[
i
][
j
];
}
else
for
(;
i
+
dist
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
MRUTIL_PREFETCH_W
(
&
ptr
[
it
.
oofs
(
j
,
i
+
dist
)]);
ptr
[
it
.
oofs
(
j
,
i
)]
=
src
[
i
][
j
];
}
#endif
for
(;
i
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
ptr
[
it
.
oofs
(
j
,
i
)]
=
src
[
i
][
j
];
if
(
it
.
uniform_o
())
for
(;
i
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
ptr
[
it
.
oofs_uni
(
j
,
i
)]
=
src
[
i
][
j
];
else
for
(;
i
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
ptr
[
it
.
oofs
(
j
,
i
)]
=
src
[
i
][
j
];
}
template
<
typename
T
,
size_t
vlen
>
void
copy_output
(
const
multi_iter
<
vlen
>
&
it
,
template
<
typename
T
,
size_t
vlen
>
MRUTIL_NOINLINE
void
copy_output
(
const
multi_iter
<
vlen
>
&
it
,
const
T
*
MRUTIL_RESTRICT
src
,
fmav
<
T
>
&
dst
)
{
auto
ptr
=
dst
.
vdata
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment