ducc, commit e266167d
authored May 18, 2020 by Martin Reinecke
parent 07b7644b

slowly return to sanity part 1/n

Changes: 4 files
pypocketfft/demos/bench.py

@@ -118,26 +118,26 @@ def bench_nd(ndim, nmax, nthr, ntry, tp, funcs, nrepeat, ttl="", filename="",
         tmp = func(a, nrepeat, nthr)
         res.append(tmp[0])
         output.append(tmp[1])
-#        print("{0:5.2e}/{1:5.2e} = {2:5.2f} L2 error={3}".format(results[0][n],results[1][n],results[0][n]/results[1][n],_l2error(output[0],output[1])))
+        print("{0:5.2e}/{1:5.2e} = {2:5.2f} L2 error={3}".format(results[0][n],results[1][n],results[0][n]/results[1][n],_l2error(output[0],output[1])))
-#    results = np.array(results)
+    results = np.array(results)
-#    plt.title("{}: {}D, {}, max_extent={}".format(
-#        ttl, ndim, str(tp), nmax))
+    plt.title("{}: {}D, {}, max_extent={}".format(
+        ttl, ndim, str(tp), nmax))
-#    plt.xlabel("time ratio")
+    plt.xlabel("time ratio")
-#    plt.ylabel("counts")
+    plt.ylabel("counts")
-#    plt.hist(results[0, :]/results[1, :], bins="auto")
+    plt.hist(results[0, :]/results[1, :], bins="auto")
-#    if filename != "":
+    if filename != "":
-#        plt.savefig(filename)
+        plt.savefig(filename)
-#    plt.show()
+    plt.show()

-funcs = (measure_pypocketfft,)
+funcs = (measure_pypocketfft, measure_fftw)
 ttl = "pypocketfft/FFTW()"
 ntry = 100
 nthr = 1
 nice_sizes = True
-#bench_nd(1, 8192, nthr, ntry, "c16", funcs, 10, ttl, "1d.png", nice_sizes)
+bench_nd(1, 8192, nthr, ntry, "c16", funcs, 10, ttl, "1d.png", nice_sizes)
 bench_nd(2, 2048, nthr, ntry, "c16", funcs, 2, ttl, "2d.png", nice_sizes)
-#bench_nd(3, 256, nthr, ntry, "c16", funcs, 2, ttl, "3d.png", nice_sizes)
+bench_nd(3, 256, nthr, ntry, "c16", funcs, 2, ttl, "3d.png", nice_sizes)
-#bench_nd(1, 8192, nthr, ntry, "c8", funcs, 10, ttl, "1d_single.png", nice_sizes)
+bench_nd(1, 8192, nthr, ntry, "c8", funcs, 10, ttl, "1d_single.png", nice_sizes)
-#bench_nd(2, 2048, nthr, ntry, "c8", funcs, 2, ttl, "2d_single.png", nice_sizes)
+bench_nd(2, 2048, nthr, ntry, "c8", funcs, 2, ttl, "2d_single.png", nice_sizes)
-#bench_nd(3, 256, nthr, ntry, "c8", funcs, 2, ttl, "3d_single.png", nice_sizes)
+bench_nd(3, 256, nthr, ntry, "c8", funcs, 2, ttl, "3d_single.png", nice_sizes)
src/mr_util/infra/simd.h

@@ -52,10 +52,6 @@ namespace mr {
 namespace detail_simd {
-template<typename T> T myexp(T); // {return -42;}
-template<> inline double myexp(double v) { return std::exp(v);}
-template<> inline float myexp(float v) { return std::exp(v);}
 template<typename T> constexpr inline bool vectorizable = false;
 template<> constexpr inline bool vectorizable<float> = true;
 template<> constexpr inline bool vectorizable<double> = true;

@@ -199,8 +195,6 @@ template<typename Op, typename T, size_t len> T reduce(const vtp<T, len> &v, Op
     res = op(res, v[i]);
   return res;
   }
-template<typename T, size_t len> vtp<T, len> exp(const vtp<T, len> &v)
-  { return v.apply(myexp<T>); }
 template<typename T> class pseudoscalar
   {
   private:

@@ -420,7 +414,6 @@ using detail_simd::native_simd;
 using detail_simd::reduce;
 using detail_simd::max;
 using detail_simd::abs;
-using detail_simd::exp;
 using detail_simd::sqrt;
 using detail_simd::any_of;
 using detail_simd::none_of;
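A note on the dropped `exp` helper: it mapped a scalar exponential (`myexp`, specialized to `std::exp`) over every lane of a `vtp<T, len>` through its `apply` member. Below is a minimal, self-contained sketch of that element-wise apply pattern; the `vec` type and `vexp` function are hypothetical stand-ins for the library's SIMD wrapper, not its actual implementation:

```cpp
#include <array>
#include <cmath>
#include <cstddef>
#include <iostream>

// Hypothetical stand-in for vtp<T, len>: a tiny fixed-length vector
// with an apply() member that maps a scalar function over all lanes.
template<typename T, std::size_t len> struct vec
  {
  std::array<T, len> v{};
  template<typename Func> vec apply(Func f) const
    {
    vec res;
    for (std::size_t i = 0; i < len; ++i) res.v[i] = f(v[i]);
    return res;
    }
  };

// Element-wise exponential in the style of the removed helper:
// forward each lane through std::exp via apply().
template<typename T, std::size_t len> vec<T, len> vexp(const vec<T, len> &x)
  { return x.apply([](T a) { return std::exp(a); }); }

int main()
  {
  vec<double, 4> x;
  x.v = {0.0, 1.0, 2.0, 3.0};
  auto y = vexp(x);
  for (auto e : y.v) std::cout << e << ' ';
  std::cout << '\n';
  }
```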
src/mr_util/infra/useful_macros.h

@@ -5,14 +5,20 @@
 #define MRUTIL_NOINLINE __attribute__((noinline))
 #define MRUTIL_RESTRICT __restrict__
 #define MRUTIL_ALIGNED(align) __attribute__ ((aligned(align)))
+#define MRUTIL_PREFETCH_R(addr) __builtin_prefetch(addr);
+#define MRUTIL_PREFETCH_W(addr) __builtin_prefetch(addr,1);
 #elif defined(_MSC_VER)
 #define MRUTIL_NOINLINE __declspec(noinline)
 #define MRUTIL_RESTRICT __restrict
 #define MRUTIL_ALIGNED(align)
+#define MRUTIL_PREFETCH_R(addr)
+#define MRUTIL_PREFETCH_W(addr)
 #else
 #define MRUTIL_NOINLINE
 #define MRUTIL_RESTRICT
 #define MRUTIL_ALIGNED(align)
+#define MRUTIL_PREFETCH_R(addr)
+#define MRUTIL_PREFETCH_W(addr)
 #endif
 #endif
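The MRUTIL_PREFETCH_R / MRUTIL_PREFETCH_W macros added here wrap GCC/Clang's __builtin_prefetch (read hint vs. write hint) and expand to nothing where the builtin is unavailable. A minimal sketch of the intended usage, modeled on the prefetch-ahead copy loops that this commit removes from fft.h; the copy_strided function, the stride handling, and the distance of 32 elements are illustrative assumptions, not library code:

```cpp
#include <cstddef>

#if defined(__GNUC__)
#define MRUTIL_PREFETCH_R(addr) __builtin_prefetch(addr);
#define MRUTIL_PREFETCH_W(addr) __builtin_prefetch(addr,1);
#else
#define MRUTIL_PREFETCH_R(addr)
#define MRUTIL_PREFETCH_W(addr)
#endif

// Illustrative strided copy that hints the hardware prefetcher a fixed
// distance ahead of the current read and write positions.
void copy_strided(const double *src, double *dst, std::size_t n, std::size_t stride)
  {
  constexpr std::size_t dist = 32;   // prefetch distance in elements (illustrative)
  std::size_t i = 0;
  for (; i + dist < n; ++i)
    {
    MRUTIL_PREFETCH_R(&src[(i + dist) * stride])   // hint: this will be read soon
    MRUTIL_PREFETCH_W(&dst[i + dist])              // hint: this will be written soon
    dst[i] = src[i * stride];
    }
  for (; i < n; ++i)                 // tail without prefetch hints
    dst[i] = src[i * stride];
  }
```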
src/mr_util/math/fft.h

@@ -38,7 +38,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef MRUTIL_FFT_H
 #define MRUTIL_FFT_H
-#include <iostream>
 #include "mr_util/math/fft1d.h"
 #ifndef POCKETFFT_CACHE_SIZE
@@ -500,7 +500,6 @@ template<size_t N> class multi_iter
       shp.erase(shp.begin()+ptrdiff_t(i));
       pos.pop_back();
       done = false;
-      // std::cout << "reduced dims" << std::endl;
       }
     }
   if (pos.size() > 0)
@@ -543,11 +542,9 @@ template<size_t N> class multi_iter
     uni_i = uni_o = true;
     for (size_t i=1; i<n; ++i)
       {
-      // std::cout << (p_i[i]-p_i[i-1]) << " " << sstr_i << std::endl;
       uni_i = uni_i && (p_i[i]-p_i[i-1] == sstr_i);
       uni_o = uni_o && (p_o[i]-p_o[i-1] == sstr_o);
       }
-    // for (size_t i=0; i<n; ++i)
     rem -= n;
     }
   ptrdiff_t iofs(size_t i) const { return p_i[0] + ptrdiff_t(i)*cstr_i; }
@@ -657,25 +654,6 @@ template <typename T, size_t vlen> MRUTIL_NOINLINE void copy_input_j1(const mult
     dst[i] = stmp;
     }
   }
-template <typename T, size_t vlen> MRUTIL_NOINLINE void copy_input_j1_a16(const multi_iter<vlen> &it,
-  const fmav<Cmplx<T>> &src, Cmplx<native_simd<T>> *MRUTIL_RESTRICT dst)
-  {
-  auto ptr = &src[it.iofs_uni(0,0)];
-  ptr = reinterpret_cast<Cmplx<T> *>(__builtin_assume_aligned(ptr,16));
-  auto istr = it.stride_in();
-  size_t i=0;
-  for (; i<it.length_in(); ++i)
-    {
-    Cmplx<native_simd<T>> stmp;
-    for (size_t j=0; j<vlen; ++j)
-      {
-      auto tmp = ptr[j+i*istr];
-      stmp.r[j] = tmp.r;
-      stmp.i[j] = tmp.i;
-      }
-    dst[i] = stmp;
-    }
-  }
 template <typename T, size_t vlen> MRUTIL_NOINLINE void copy_input_i1(const multi_iter<vlen> &it,
   const fmav<Cmplx<T>> &src, Cmplx<native_simd<T>> *MRUTIL_RESTRICT dst)
   {
@@ -694,9 +672,6 @@ template <typename T, size_t vlen> MRUTIL_NOINLINE void copy_input_i1(const mult
     dst[i] = stmp;
     }
   }
-#define MRFFT_PREFETCH
-#define MRUTIL_PREFETCH_R(addr) __builtin_prefetch(addr);
-#define MRUTIL_PREFETCH_W(addr) __builtin_prefetch(addr,1);
 template <typename T, size_t vlen> MRUTIL_NOINLINE void copy_input(const multi_iter<vlen> &it,
   const fmav<Cmplx<T>> &src, Cmplx<native_simd<T>> *MRUTIL_RESTRICT dst)
   {
@@ -706,36 +681,29 @@ template <typename T, size_t vlen> MRUTIL_NOINLINE void copy_input(const multi_i
   auto jstr = it.unistride_i();
   auto istr = it.stride_in();
   if (istr==1)
-    copy_input_i1(it, src, dst);
-//    for (size_t i=0; i<it.length_in(); ++i)
-//      {
-//      Cmplx<native_simd<T>> stmp;
-//      for (size_t j=0; j<vlen; ++j)
-//        {
-//        auto tmp = ptr[j*jstr+i];
-//        stmp.r[j] = tmp.r;
-//        stmp.i[j] = tmp.i;
-//        }
-//      dst[i] = stmp;
-//      }
+    for (ptrdiff_t i=0; i<it.length_in(); ++i)
+      {
+      Cmplx<native_simd<T>> stmp;
+      for (ptrdiff_t j=0; j<vlen; ++j)
+        {
+        auto tmp = ptr[j*jstr+i];
+        stmp.r[j] = tmp.r;
+        stmp.i[j] = tmp.i;
+        }
+      dst[i] = stmp;
+      }
   else if (jstr==1)
-    {
-    if ((reinterpret_cast<uintptr_t>(src.data())&15)==0)
-      copy_input_j1_a16(it, src, dst);
-    else
-      copy_input_j1(it, src, dst);
-    }
-//    for (size_t i=0; i<it.length_in(); ++i)
-//      {
-//      Cmplx<native_simd<T>> stmp;
-//      for (size_t j=0; j<vlen; ++j)
-//        {
-//        auto tmp = ptr[j+i*istr];
-//        stmp.r[j] = tmp.r;
-//        stmp.i[j] = tmp.i;
-//        }
-//      dst[i] = stmp;
-//      }
+    for (ptrdiff_t i=0; i<it.length_in(); ++i)
+      {
+      Cmplx<native_simd<T>> stmp;
+      for (ptrdiff_t j=0; j<vlen; ++j)
+        {
+        auto tmp = ptr[j+i*istr];
+        stmp.r[j] = tmp.r;
+        stmp.i[j] = tmp.i;
+        }
+      dst[i] = stmp;
+      }
   else
     for (size_t i=0; i<it.length_in(); ++i)
       {
@@ -766,36 +734,12 @@ template <typename T, size_t vlen> MRUTIL_NOINLINE void copy_input(const multi_i
 template <typename T, size_t vlen> MRUTIL_NOINLINE void copy_input(const multi_iter<vlen> &it,
   const fmav<T> &src, native_simd<T> *MRUTIL_RESTRICT dst)
   {
-  size_t i=0;
-#ifdef MRFFT_PREFETCH
-  constexpr size_t dist=32;
-  if (it.uniform_i())
-    for (; i+dist<it.length_in(); ++i)
-      {
-      native_simd<T> stmp;
-      MRUTIL_PREFETCH_W(&dst[i+dist]);
-      for (size_t j=0; j<vlen; ++j)
-        {
-        MRUTIL_PREFETCH_R(&src[it.iofs_uni(j,i+dist)]);
-        stmp[j] = src[it.iofs_uni(j,i)];
-        }
-      dst[i] = stmp;
-      }
-  else
-    for (; i+dist<it.length_in(); ++i)
-      for (size_t j=0; j<vlen; ++j)
-        {
-        MRUTIL_PREFETCH_R(&src[it.iofs(j,i+dist)]);
-        MRUTIL_PREFETCH_W(&dst[i+dist]);
-        dst[i][j] = src[it.iofs(j,i)];
-        }
-#endif
   if (it.uniform_i())
-    for (; i<it.length_in(); ++i)
+    for (size_t i=0; i<it.length_in(); ++i)
       for (size_t j=0; j<vlen; ++j)
         dst[i][j] = src[it.iofs_uni(j,i)];
   else
-    for (; i<it.length_in(); ++i)
+    for (size_t i=0; i<it.length_in(); ++i)
      for (size_t j=0; j<vlen; ++j)
        dst[i][j] = src[it.iofs(j,i)];
   }
@@ -812,30 +756,12 @@ template<typename T, size_t vlen> MRUTIL_NOINLINE void copy_output(const multi_i
   const Cmplx<native_simd<T>> *MRUTIL_RESTRICT src, fmav<Cmplx<T>> &dst)
   {
   auto ptr = dst.vdata();
-  size_t i=0;
-#ifdef MRFFT_PREFETCH
-  constexpr size_t dist=32;
-  if (it.uniform_o())
-    for (; i+dist<it.length_out(); ++i)
-      for (size_t j=0; j<vlen; ++j)
-        {
-        MRUTIL_PREFETCH_W(&ptr[it.oofs_uni(j,i+dist)]);
-        ptr[it.oofs_uni(j,i)].Set(src[i].r[j],src[i].i[j]);
-        }
-  else
-    for (; i+dist<it.length_out(); ++i)
-      for (size_t j=0; j<vlen; ++j)
-        {
-        MRUTIL_PREFETCH_W(&ptr[it.oofs(j,i+dist)]);
-        ptr[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
-        }
-#endif
   if (it.uniform_o())
-    for (; i<it.length_out(); ++i)
+    for (size_t i=0; i<it.length_out(); ++i)
       for (size_t j=0; j<vlen; ++j)
         ptr[it.oofs_uni(j,i)].Set(src[i].r[j],src[i].i[j]);
   else
-    for (; i<it.length_out(); ++i)
+    for (size_t i=0; i<it.length_out(); ++i)
       for (size_t j=0; j<vlen; ++j)
         ptr[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
   }
@@ -844,30 +770,12 @@ template<typename T, size_t vlen> MRUTIL_NOINLINE void copy_output(const multi_i
   const native_simd<T> *MRUTIL_RESTRICT src, fmav<T> &dst)
   {
   auto ptr = dst.vdata();
-  size_t i=0;
-#ifdef MRFFT_PREFETCH
-  constexpr size_t dist=32;
-  if (it.uniform_o())
-    for (; i+dist<it.length_out(); ++i)
-      for (size_t j=0; j<vlen; ++j)
-        {
-        MRUTIL_PREFETCH_W(&ptr[it.oofs_uni(j,i+dist)]);
-        ptr[it.oofs_uni(j,i)] = src[i][j];
-        }
-  else
-    for (; i+dist<it.length_out(); ++i)
-      for (size_t j=0; j<vlen; ++j)
-        {
-        MRUTIL_PREFETCH_W(&ptr[it.oofs(j,i+dist)]);
-        ptr[it.oofs(j,i)] = src[i][j];
-        }
-#endif
   if (it.uniform_o())
-    for (; i<it.length_out(); ++i)
+    for (size_t i=0; i<it.length_out(); ++i)
       for (size_t j=0; j<vlen; ++j)
         ptr[it.oofs_uni(j,i)] = src[i][j];
   else
-    for (; i<it.length_out(); ++i)
+    for (size_t i=0; i<it.length_out(); ++i)
       for (size_t j=0; j<vlen; ++j)
         ptr[it.oofs(j,i)] = src[i][j];
   }
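All of the copy_input / copy_output overloads above share one pattern: gather vlen strided scalars into the lanes of a single SIMD-width element on input, and scatter them back on output. Below is a minimal standalone sketch of the gather step, with plain std::array/std::vector standing in for native_simd and fmav, and a hypothetical iofs function in place of multi_iter's offset computation (all names here are illustrative assumptions, not the library's API):

```cpp
#include <array>
#include <cstddef>
#include <iostream>
#include <vector>

constexpr std::size_t vlen = 4;                 // assumed SIMD width
using simd_block = std::array<double, vlen>;    // stand-in for native_simd<T>

// Stand-in for multi_iter::iofs(j, i): offset of element i in the j-th
// 1-D line, here simply vlen lines of length len laid out contiguously.
inline std::size_t iofs(std::size_t j, std::size_t i, std::size_t len)
  { return j*len + i; }

// Gather vlen parallel lines into an array of SIMD-style blocks, so that
// dst[i][j] holds element i of line j (the copy_input pattern).
void gather_input(const std::vector<double> &src, std::vector<simd_block> &dst,
                  std::size_t len)
  {
  dst.resize(len);
  for (std::size_t i=0; i<len; ++i)
    for (std::size_t j=0; j<vlen; ++j)
      dst[i][j] = src[iofs(j, i, len)];
  }

int main()
  {
  std::size_t len = 3;
  std::vector<double> src(vlen*len);
  for (std::size_t k=0; k<src.size(); ++k) src[k] = double(k);
  std::vector<simd_block> dst;
  gather_input(src, dst, len);
  std::cout << dst[1][2] << '\n';   // element 1 of line 2 -> 7
  }
```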