Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Martin Reinecke
ducc
Commits
f21a205b
Commit
f21a205b
authored
Sep 01, 2020
by
Martin Reinecke
Browse files
more tweaks
parent
4475d484
Pipeline
#81409
passed with stages
in 12 minutes and 20 seconds
Changes
1
Pipelines
1
Show whitespace changes
Inline
Side-by-side
python/gridder_cxx.h
View file @
f21a205b
...
...
@@ -56,8 +56,6 @@ inline complex<float> hsum_cmplx(native_simd<float> vr, native_simd<float> vi)
auto
t2
=
_mm_hadd_ps
(
_mm256_extractf128_ps
(
t1
,
0
),
_mm256_extractf128_ps
(
t1
,
1
));
t2
+=
_mm_shuffle_ps
(
t2
,
t2
,
_MM_SHUFFLE
(
1
,
0
,
3
,
2
));
return
complex
<
float
>
(
t2
[
0
],
t2
[
1
]);
//FIXME perhaps some shuffling?
return
complex
<
float
>
(
t2
[
0
]
+
t2
[
2
],
t2
[
1
]
+
t2
[
3
]);
}
#endif
...
...
@@ -78,9 +76,11 @@ template<typename T> void complex2hartley
MR_assert
(
grid
.
conformable
(
grid2
),
"shape mismatch"
);
size_t
nu
=
grid
.
shape
(
0
),
nv
=
grid
.
shape
(
1
);
exec
Static
(
nu
,
nthreads
,
0
,
[
&
](
Scheduler
&
sched
)
exec
Parallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
while
(
auto
rng
=
sched
.
getNext
())
for
(
auto
u
=
rng
.
lo
;
u
<
rng
.
hi
;
++
u
)
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nu
);
for
(
auto
u
=
lo
;
u
<
hi
;
++
u
)
{
size_t
xu
=
(
u
==
0
)
?
0
:
nu
-
u
;
for
(
size_t
v
=
0
;
v
<
nv
;
++
v
)
...
...
@@ -99,9 +99,11 @@ template<typename T> void hartley2complex
MR_assert
(
grid
.
conformable
(
grid2
),
"shape mismatch"
);
size_t
nu
=
grid
.
shape
(
0
),
nv
=
grid
.
shape
(
1
);
exec
Static
(
nu
,
nthreads
,
0
,
[
&
](
Scheduler
&
sched
)
exec
Parallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
while
(
auto
rng
=
sched
.
getNext
())
for
(
auto
u
=
rng
.
lo
;
u
<
rng
.
hi
;
++
u
)
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nu
);
for
(
auto
u
=
lo
;
u
<
hi
;
++
u
)
{
size_t
xu
=
(
u
==
0
)
?
0
:
nu
-
u
;
for
(
size_t
v
=
0
;
v
<
nv
;
++
v
)
...
...
@@ -133,9 +135,12 @@ template<typename T> void hartley2_2D(mav<T,2> &arr, size_t vlim,
}
else
r2r_separable_hartley
(
farr
,
farr
,
{
0
,
1
},
T
(
1
),
nthreads
);
execStatic
((
nu
+
1
)
/
2
-
1
,
nthreads
,
0
,
[
&
](
Scheduler
&
sched
)
execParallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
while
(
auto
rng
=
sched
.
getNext
())
for
(
auto
i
=
rng
.
lo
+
1
;
i
<
rng
.
hi
+
1
;
++
i
)
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
(
nu
+
1
)
/
2
-
1
);
for
(
auto
i
=
lo
+
1
;
i
<
hi
+
1
;
++
i
)
for
(
size_t
j
=
1
;
j
<
(
nv
+
1
)
/
2
;
++
j
)
{
T
a
=
arr
(
i
,
j
);
...
...
@@ -288,9 +293,11 @@ template<typename T> class Params
checkShape
(
dirty
.
shape
(),
{
nxdirty
,
nydirty
});
auto
cfu
=
krn
->
corfunc
(
nxdirty
/
2
+
1
,
1.
/
nu
,
nthreads
);
auto
cfv
=
krn
->
corfunc
(
nydirty
/
2
+
1
,
1.
/
nv
,
nthreads
);
exec
Static
(
nxdirty
,
nthreads
,
0
,
[
&
](
Scheduler
&
sched
)
exec
Parallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
while
(
auto
rng
=
sched
.
getNext
())
for
(
auto
i
=
rng
.
lo
;
i
<
rng
.
hi
;
++
i
)
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nxdirty
);
for
(
auto
i
=
lo
;
i
<
hi
;
++
i
)
{
int
icfu
=
abs
(
int
(
nxdirty
/
2
)
-
int
(
i
));
for
(
size_t
j
=
0
;
j
<
nydirty
;
++
j
)
...
...
@@ -311,13 +318,15 @@ template<typename T> class Params
checkShape
(
dirty
.
shape
(),
{
nxdirty
,
nydirty
});
double
x0
=
-
0.5
*
nxdirty
*
pixsize_x
,
y0
=
-
0.5
*
nydirty
*
pixsize_y
;
exec
Static
(
nxdirty
/
2
+
1
,
nthreads
,
0
,
[
&
](
Scheduler
&
sched
)
exec
Parallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nxdirty
/
2
+
1
);
using
vtype
=
native_simd
<
T
>
;
constexpr
size_t
vlen
=
vtype
::
size
();
size_t
nvec
=
(
nydirty
/
2
+
1
+
(
vlen
-
1
))
/
vlen
;
vector
<
vtype
>
ph
(
nvec
),
sp
(
nvec
),
cp
(
nvec
);
while
(
auto
rng
=
sched
.
getNext
())
for
(
auto
i
=
rng
.
lo
;
i
<
rng
.
hi
;
++
i
)
for
(
auto
i
=
lo
;
i
<
hi
;
++
i
)
{
T
fx
=
T
(
x0
+
i
*
pixsize_x
);
fx
*=
fx
;
...
...
@@ -398,10 +407,25 @@ template<typename T> class Params
auto
cfu
=
krn
->
corfunc
(
nxdirty
/
2
+
1
,
1.
/
nu
,
nthreads
);
auto
cfv
=
krn
->
corfunc
(
nydirty
/
2
+
1
,
1.
/
nv
,
nthreads
);
// FIXME: maybe we don't have to fill everything and can save some time
grid
.
fill
(
0
);
exec
Static
(
nxdirty
,
nthreads
,
0
,
[
&
](
Scheduler
&
sched
)
//
grid.fill(0);
exec
Parallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
while
(
auto
rng
=
sched
.
getNext
())
for
(
auto
i
=
rng
.
lo
;
i
<
rng
.
hi
;
++
i
)
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nu
);
for
(
auto
i
=
lo
;
i
<
hi
;
++
i
)
{
size_t
lo2
=
0
,
hi2
=
nv
;
if
((
i
<
nxdirty
/
2
)
||
(
i
>=
nu
-
nxdirty
/
2
))
{
lo2
=
nydirty
/
2
;
hi2
=
nv
-
nydirty
/
2
+
1
;
}
for
(
auto
j
=
lo2
;
j
<
hi2
;
++
j
)
grid
.
v
(
i
,
j
)
=
0
;
}
});
execParallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nxdirty
);
for
(
auto
i
=
lo
;
i
<
hi
;
++
i
)
{
int
icfu
=
abs
(
int
(
nxdirty
/
2
)
-
int
(
i
));
for
(
size_t
j
=
0
;
j
<
nydirty
;
++
j
)
...
...
@@ -422,17 +446,32 @@ template<typename T> class Params
checkShape
(
dirty
.
shape
(),
{
nxdirty
,
nydirty
});
checkShape
(
grid
.
shape
(),
{
nu
,
nv
});
// FIXME: maybe we don't have to fill everything and can save some time
grid
.
fill
(
0
);
// grid.fill(0);
execParallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nu
);
for
(
auto
i
=
lo
;
i
<
hi
;
++
i
)
{
size_t
lo2
=
0
,
hi2
=
nv
;
if
((
i
<
nxdirty
/
2
)
||
(
i
>=
nu
-
nxdirty
/
2
))
{
lo2
=
nydirty
/
2
;
hi2
=
nv
-
nydirty
/
2
+
1
;
}
for
(
auto
j
=
lo2
;
j
<
hi2
;
++
j
)
grid
.
v
(
i
,
j
)
=
0
;
}
});
double
x0
=
-
0.5
*
nxdirty
*
pixsize_x
,
y0
=
-
0.5
*
nydirty
*
pixsize_y
;
exec
Static
(
nxdirty
/
2
+
1
,
nthreads
,
0
,
[
&
](
Scheduler
&
sched
)
exec
Parallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nxdirty
/
2
+
1
);
using
vtype
=
native_simd
<
T
>
;
constexpr
size_t
vlen
=
vtype
::
size
();
size_t
nvec
=
(
nydirty
/
2
+
1
+
(
vlen
-
1
))
/
vlen
;
vector
<
vtype
>
ph
(
nvec
),
sp
(
nvec
),
cp
(
nvec
);
while
(
auto
rng
=
sched
.
getNext
())
for
(
auto
i
=
rng
.
lo
;
i
<
rng
.
hi
;
++
i
)
for
(
auto
i
=
lo
;
i
<
hi
;
++
i
)
{
T
fx
=
T
(
x0
+
i
*
pixsize_x
);
fx
*=
fx
;
...
...
@@ -511,7 +550,7 @@ template<typename T> class Params
iv0
=
min
(
int
(
v
+
vshift
)
-
int
(
nv
),
maxiv0
);
}
void
report
()
void
report
()
{
if
(
verbosity
==
0
)
return
;
cout
<<
(
gridding
?
"Gridding"
:
"Degridding"
)
...
...
@@ -527,7 +566,7 @@ void report()
<<
", wmax/dw="
<<
wmax_d
/
dw
<<
", nranges="
<<
ranges
.
size
()
<<
endl
;
}
void
scanData
()
void
scanData
()
{
timers
.
push
(
"Initial scan"
);
size_t
nrow
=
bl
.
Nrows
(),
...
...
@@ -573,7 +612,7 @@ void scanData()
timers
.
pop
();
}
auto
getNuNv
()
auto
getNuNv
()
{
timers
.
push
(
"parameter calculation"
);
double
x0
=
-
0.5
*
nxdirty
*
pixsize_x
,
...
...
@@ -620,7 +659,7 @@ auto getNuNv()
return
minidx
;
}
void
countRanges
()
void
countRanges
()
{
timers
.
push
(
"range count"
);
size_t
nrow
=
bl
.
Nrows
(),
...
...
@@ -707,16 +746,18 @@ void countRanges()
timers
.
pop
();
}
void
apply_global_corrections
(
mav
<
T
,
2
>
&
dirty
)
void
apply_global_corrections
(
mav
<
T
,
2
>
&
dirty
)
{
timers
.
push
(
"global corrections"
);
double
x0
=
-
0.5
*
nxdirty
*
pixsize_x
,
y0
=
-
0.5
*
nydirty
*
pixsize_y
;
auto
cfu
=
krn
->
corfunc
(
nxdirty
/
2
+
1
,
1.
/
nu
,
nthreads
);
auto
cfv
=
krn
->
corfunc
(
nydirty
/
2
+
1
,
1.
/
nv
,
nthreads
);
exec
Static
(
nxdirty
/
2
+
1
,
nthreads
,
0
,
[
&
](
Scheduler
&
sched
)
exec
Parallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
while
(
auto
rng
=
sched
.
getNext
())
for
(
auto
i
=
rng
.
lo
;
i
<
rng
.
hi
;
++
i
)
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nxdirty
/
2
+
1
);
for
(
auto
i
=
lo
;
i
<
hi
;
++
i
)
{
auto
fx
=
T
(
x0
+
i
*
pixsize_x
);
fx
*=
fx
;
...
...
@@ -759,7 +800,8 @@ void apply_global_corrections(mav<T,2> &dirty)
});
timers
.
pop
();
}
template
<
size_t
supp
,
bool
wgrid
>
class
HelperX2g2
template
<
size_t
supp
,
bool
wgrid
>
class
HelperX2g2
{
public:
static
constexpr
size_t
vlen
=
native_simd
<
T
>::
size
();
...
...
@@ -771,7 +813,7 @@ template<size_t supp, bool wgrid> class HelperX2g2
static
constexpr
int
sv
=
2
*
nsafe
+
(
1
<<
logsquare
);
static
constexpr
int
svvec
=
((
sv
+
vlen
-
1
)
/
vlen
)
*
vlen
;
static
constexpr
double
xsupp
=
2.
/
supp
;
const
Params
*
parent
;
const
Params
*
parent
;
TemplateKernel
<
supp
,
T
>
tkrn
;
mav
<
complex
<
T
>
,
2
>
&
grid
;
int
iu0
,
iv0
;
// start index of the current visibility
...
...
@@ -855,7 +897,7 @@ const Params *parent;
};
template
<
size_t
SUPP
,
bool
wgrid
>
[[
gnu
::
hot
]]
void
x2grid_c_helper
template
<
size_t
SUPP
,
bool
wgrid
>
[[
gnu
::
hot
]]
void
x2grid_c_helper
(
mav
<
complex
<
T
>
,
2
>
&
grid
,
size_t
p0
,
double
w0
)
{
...
...
@@ -922,7 +964,7 @@ template<size_t SUPP, bool wgrid> [[gnu::hot]] void x2grid_c_helper
});
}
template
<
bool
wgrid
>
void
x2grid_c
template
<
bool
wgrid
>
void
x2grid_c
(
mav
<
complex
<
T
>
,
2
>
&
grid
,
size_t
p0
,
double
w0
=-
1
)
{
...
...
@@ -960,7 +1002,7 @@ template<bool wgrid> void x2grid_c
timers
.
pop
();
}
void
x2dirty
()
void
x2dirty
()
{
if
(
do_wgridding
)
{
...
...
@@ -973,7 +1015,19 @@ void x2dirty()
{
double
w
=
wmin
+
pl
*
dw
;
timers
.
push
(
"zeroing grid"
);
#if 0
//FIXME: we don't need to zero the entire array here...
execParallel(nthreads, [&](Scheduler &sched)
{
auto tid = sched.thread_num();
auto [lo, hi] = calcShare(nthreads, tid, nu);
for (auto i=lo; i<hi; ++i)
for (size_t j=0; j<nv; ++j)
grid.v(i,j) = 0;
});
#else
grid
.
fill
(
0
);
#endif
timers
.
pop
();
x2grid_c
<
true
>
(
grid
,
pl
,
w
);
grid2dirty_c_overwrite_wscreen_add
(
grid
,
dirty_out
,
T
(
w
));
...
...
@@ -995,7 +1049,7 @@ void x2dirty()
grid2dirty_overwrite
(
rgrid
,
dirty_out
);
}
}
template
<
size_t
supp
,
bool
wgrid
>
class
HelperG2x2
template
<
size_t
supp
,
bool
wgrid
>
class
HelperG2x2
{
public:
static
constexpr
size_t
vlen
=
native_simd
<
T
>::
size
();
...
...
@@ -1007,7 +1061,7 @@ template<size_t supp, bool wgrid> class HelperG2x2
static
constexpr
int
sv
=
2
*
nsafe
+
(
1
<<
logsquare
);
static
constexpr
int
svvec
=
((
sv
+
vlen
-
1
)
/
vlen
)
*
vlen
;
static
constexpr
double
xsupp
=
2.
/
supp
;
const
Params
*
parent
;
const
Params
*
parent
;
TemplateKernel
<
supp
,
T
>
tkrn
;
const
mav
<
complex
<
T
>
,
2
>
&
grid
;
...
...
@@ -1083,7 +1137,7 @@ const Params *parent;
}
};
template
<
size_t
SUPP
,
bool
wgrid
>
[[
gnu
::
hot
]]
void
grid2x_c_helper
template
<
size_t
SUPP
,
bool
wgrid
>
[[
gnu
::
hot
]]
void
grid2x_c_helper
(
const
mav
<
complex
<
T
>
,
2
>
&
grid
,
size_t
p0
,
double
w0
)
{
...
...
@@ -1112,15 +1166,15 @@ template<size_t SUPP, bool wgrid> [[gnu::hot]] void grid2x_c_helper
native_simd
<
T
>
rr
=
0
,
ri
=
0
;
for
(
size_t
cu
=
0
;
cu
<
SUPP
;
++
cu
)
{
// if constexpr(NVEC==1)
// {
// auto fct = kv[0]*ku[cu];
// const auto * DUCC0_RESTRICT pxr = hlp.p0r + cu*jump;
// const auto * DUCC0_RESTRICT pxi = hlp.p0i + cu*jump;
// rr += native_simd<T>::loadu(pxr)*fct;
// ri += native_simd<T>::loadu(pxi)*fct;
// }
// else
// if constexpr(NVEC==1)
// {
// auto fct = kv[0]*ku[cu];
// const auto * DUCC0_RESTRICT pxr = hlp.p0r + cu*jump;
// const auto * DUCC0_RESTRICT pxi = hlp.p0i + cu*jump;
// rr += native_simd<T>::loadu(pxr)*fct;
// ri += native_simd<T>::loadu(pxi)*fct;
// }
// else
{
native_simd
<
T
>
tmpr
(
0
),
tmpi
(
0
);
for
(
size_t
cv
=
0
;
cv
<
NVEC
;
++
cv
)
...
...
@@ -1135,7 +1189,7 @@ template<size_t SUPP, bool wgrid> [[gnu::hot]] void grid2x_c_helper
}
}
auto
r
=
hsum_cmplx
(
rr
,
ri
);
// auto r = complex<T>(reduce(rr, std::plus<>()), reduce(ri, std::plus<>()));
// auto r = complex<T>(reduce(rr, std::plus<>()), reduce(ri, std::plus<>()));
if
(
flip
)
r
=
conj
(
r
);
if
(
have_wgt
)
r
*=
wgt
(
row
,
ch
);
ms_out
.
v
(
row
,
ch
)
+=
r
;
...
...
@@ -1145,7 +1199,7 @@ template<size_t SUPP, bool wgrid> [[gnu::hot]] void grid2x_c_helper
});
}
template
<
bool
wgrid
>
void
grid2x_c
template
<
bool
wgrid
>
void
grid2x_c
(
const
mav
<
complex
<
T
>
,
2
>
&
grid
,
size_t
p0
,
double
w0
=-
1
)
{
...
...
@@ -1183,7 +1237,7 @@ template<bool wgrid> void grid2x_c
timers
.
pop
();
}
void
dirty2x
()
void
dirty2x
()
{
if
(
do_wgridding
)
{
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment