Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Martin Reinecke
ducc
Commits
f21a205b
Commit
f21a205b
authored
Sep 01, 2020
by
Martin Reinecke
Browse files
more tweaks
parent
4475d484
Pipeline
#81409
passed with stages
in 12 minutes and 20 seconds
Changes
1
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
python/gridder_cxx.h
View file @
f21a205b
...
...
@@ -56,8 +56,6 @@ inline complex<float> hsum_cmplx(native_simd<float> vr, native_simd<float> vi)
auto
t2
=
_mm_hadd_ps
(
_mm256_extractf128_ps
(
t1
,
0
),
_mm256_extractf128_ps
(
t1
,
1
));
t2
+=
_mm_shuffle_ps
(
t2
,
t2
,
_MM_SHUFFLE
(
1
,
0
,
3
,
2
));
return
complex
<
float
>
(
t2
[
0
],
t2
[
1
]);
//FIXME perhaps some shuffling?
return
complex
<
float
>
(
t2
[
0
]
+
t2
[
2
],
t2
[
1
]
+
t2
[
3
]);
}
#endif
...
...
@@ -78,9 +76,11 @@ template<typename T> void complex2hartley
MR_assert
(
grid
.
conformable
(
grid2
),
"shape mismatch"
);
size_t
nu
=
grid
.
shape
(
0
),
nv
=
grid
.
shape
(
1
);
exec
Static
(
nu
,
nthreads
,
0
,
[
&
](
Scheduler
&
sched
)
exec
Parallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
while
(
auto
rng
=
sched
.
getNext
())
for
(
auto
u
=
rng
.
lo
;
u
<
rng
.
hi
;
++
u
)
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nu
);
for
(
auto
u
=
lo
;
u
<
hi
;
++
u
)
{
size_t
xu
=
(
u
==
0
)
?
0
:
nu
-
u
;
for
(
size_t
v
=
0
;
v
<
nv
;
++
v
)
...
...
@@ -99,9 +99,11 @@ template<typename T> void hartley2complex
MR_assert
(
grid
.
conformable
(
grid2
),
"shape mismatch"
);
size_t
nu
=
grid
.
shape
(
0
),
nv
=
grid
.
shape
(
1
);
exec
Static
(
nu
,
nthreads
,
0
,
[
&
](
Scheduler
&
sched
)
exec
Parallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
while
(
auto
rng
=
sched
.
getNext
())
for
(
auto
u
=
rng
.
lo
;
u
<
rng
.
hi
;
++
u
)
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nu
);
for
(
auto
u
=
lo
;
u
<
hi
;
++
u
)
{
size_t
xu
=
(
u
==
0
)
?
0
:
nu
-
u
;
for
(
size_t
v
=
0
;
v
<
nv
;
++
v
)
...
...
@@ -133,9 +135,12 @@ template<typename T> void hartley2_2D(mav<T,2> &arr, size_t vlim,
}
else
r2r_separable_hartley
(
farr
,
farr
,
{
0
,
1
},
T
(
1
),
nthreads
);
execStatic
((
nu
+
1
)
/
2
-
1
,
nthreads
,
0
,
[
&
](
Scheduler
&
sched
)
execParallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
while
(
auto
rng
=
sched
.
getNext
())
for
(
auto
i
=
rng
.
lo
+
1
;
i
<
rng
.
hi
+
1
;
++
i
)
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
(
nu
+
1
)
/
2
-
1
);
for
(
auto
i
=
lo
+
1
;
i
<
hi
+
1
;
++
i
)
for
(
size_t
j
=
1
;
j
<
(
nv
+
1
)
/
2
;
++
j
)
{
T
a
=
arr
(
i
,
j
);
...
...
@@ -235,30 +240,30 @@ constexpr int logsquare=4;
template
<
typename
T
>
class
Params
{
private:
bool
gridding
;
TimerHierarchy
timers
;
const
mav
<
complex
<
T
>
,
2
>
&
ms_in
;
mav
<
complex
<
T
>
,
2
>
&
ms_out
;
const
mav
<
T
,
2
>
&
dirty_in
;
mav
<
T
,
2
>
&
dirty_out
;
const
mav
<
T
,
2
>
&
wgt
;
const
mav
<
uint8_t
,
2
>
&
mask
;
double
pixsize_x
,
pixsize_y
;
size_t
nxdirty
,
nydirty
;
double
epsilon
;
bool
do_wgridding
;
size_t
nthreads
;
size_t
verbosity
;
bool
negate_v
,
divide_by_n
;
Baselines
bl
;
VVR
ranges
;
double
wmin_d
,
wmax_d
;
size_t
nvis
;
double
wmin
,
dw
;
size_t
nplanes
;
double
nm1min
;
vector
<
uint8_t
>
active
;
bool
gridding
;
TimerHierarchy
timers
;
const
mav
<
complex
<
T
>
,
2
>
&
ms_in
;
mav
<
complex
<
T
>
,
2
>
&
ms_out
;
const
mav
<
T
,
2
>
&
dirty_in
;
mav
<
T
,
2
>
&
dirty_out
;
const
mav
<
T
,
2
>
&
wgt
;
const
mav
<
uint8_t
,
2
>
&
mask
;
double
pixsize_x
,
pixsize_y
;
size_t
nxdirty
,
nydirty
;
double
epsilon
;
bool
do_wgridding
;
size_t
nthreads
;
size_t
verbosity
;
bool
negate_v
,
divide_by_n
;
Baselines
bl
;
VVR
ranges
;
double
wmin_d
,
wmax_d
;
size_t
nvis
;
double
wmin
,
dw
;
size_t
nplanes
;
double
nm1min
;
vector
<
uint8_t
>
active
;
size_t
nu
,
nv
;
double
ofactor
;
...
...
@@ -288,9 +293,11 @@ template<typename T> class Params
checkShape
(
dirty
.
shape
(),
{
nxdirty
,
nydirty
});
auto
cfu
=
krn
->
corfunc
(
nxdirty
/
2
+
1
,
1.
/
nu
,
nthreads
);
auto
cfv
=
krn
->
corfunc
(
nydirty
/
2
+
1
,
1.
/
nv
,
nthreads
);
exec
Static
(
nxdirty
,
nthreads
,
0
,
[
&
](
Scheduler
&
sched
)
exec
Parallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
while
(
auto
rng
=
sched
.
getNext
())
for
(
auto
i
=
rng
.
lo
;
i
<
rng
.
hi
;
++
i
)
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nxdirty
);
for
(
auto
i
=
lo
;
i
<
hi
;
++
i
)
{
int
icfu
=
abs
(
int
(
nxdirty
/
2
)
-
int
(
i
));
for
(
size_t
j
=
0
;
j
<
nydirty
;
++
j
)
...
...
@@ -311,13 +318,15 @@ template<typename T> class Params
checkShape
(
dirty
.
shape
(),
{
nxdirty
,
nydirty
});
double
x0
=
-
0.5
*
nxdirty
*
pixsize_x
,
y0
=
-
0.5
*
nydirty
*
pixsize_y
;
exec
Static
(
nxdirty
/
2
+
1
,
nthreads
,
0
,
[
&
](
Scheduler
&
sched
)
exec
Parallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nxdirty
/
2
+
1
);
using
vtype
=
native_simd
<
T
>
;
constexpr
size_t
vlen
=
vtype
::
size
();
size_t
nvec
=
(
nydirty
/
2
+
1
+
(
vlen
-
1
))
/
vlen
;
vector
<
vtype
>
ph
(
nvec
),
sp
(
nvec
),
cp
(
nvec
);
while
(
auto
rng
=
sched
.
getNext
())
for
(
auto
i
=
rng
.
lo
;
i
<
rng
.
hi
;
++
i
)
for
(
auto
i
=
lo
;
i
<
hi
;
++
i
)
{
T
fx
=
T
(
x0
+
i
*
pixsize_x
);
fx
*=
fx
;
...
...
@@ -398,10 +407,25 @@ template<typename T> class Params
auto
cfu
=
krn
->
corfunc
(
nxdirty
/
2
+
1
,
1.
/
nu
,
nthreads
);
auto
cfv
=
krn
->
corfunc
(
nydirty
/
2
+
1
,
1.
/
nv
,
nthreads
);
// FIXME: maybe we don't have to fill everything and can save some time
grid
.
fill
(
0
);
exec
Static
(
nxdirty
,
nthreads
,
0
,
[
&
](
Scheduler
&
sched
)
//
grid.fill(0);
exec
Parallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
while
(
auto
rng
=
sched
.
getNext
())
for
(
auto
i
=
rng
.
lo
;
i
<
rng
.
hi
;
++
i
)
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nu
);
for
(
auto
i
=
lo
;
i
<
hi
;
++
i
)
{
size_t
lo2
=
0
,
hi2
=
nv
;
if
((
i
<
nxdirty
/
2
)
||
(
i
>=
nu
-
nxdirty
/
2
))
{
lo2
=
nydirty
/
2
;
hi2
=
nv
-
nydirty
/
2
+
1
;
}
for
(
auto
j
=
lo2
;
j
<
hi2
;
++
j
)
grid
.
v
(
i
,
j
)
=
0
;
}
});
execParallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nxdirty
);
for
(
auto
i
=
lo
;
i
<
hi
;
++
i
)
{
int
icfu
=
abs
(
int
(
nxdirty
/
2
)
-
int
(
i
));
for
(
size_t
j
=
0
;
j
<
nydirty
;
++
j
)
...
...
@@ -422,17 +446,32 @@ template<typename T> class Params
checkShape
(
dirty
.
shape
(),
{
nxdirty
,
nydirty
});
checkShape
(
grid
.
shape
(),
{
nu
,
nv
});
// FIXME: maybe we don't have to fill everything and can save some time
grid
.
fill
(
0
);
// grid.fill(0);
execParallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nu
);
for
(
auto
i
=
lo
;
i
<
hi
;
++
i
)
{
size_t
lo2
=
0
,
hi2
=
nv
;
if
((
i
<
nxdirty
/
2
)
||
(
i
>=
nu
-
nxdirty
/
2
))
{
lo2
=
nydirty
/
2
;
hi2
=
nv
-
nydirty
/
2
+
1
;
}
for
(
auto
j
=
lo2
;
j
<
hi2
;
++
j
)
grid
.
v
(
i
,
j
)
=
0
;
}
});
double
x0
=
-
0.5
*
nxdirty
*
pixsize_x
,
y0
=
-
0.5
*
nydirty
*
pixsize_y
;
exec
Static
(
nxdirty
/
2
+
1
,
nthreads
,
0
,
[
&
](
Scheduler
&
sched
)
exec
Parallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nxdirty
/
2
+
1
);
using
vtype
=
native_simd
<
T
>
;
constexpr
size_t
vlen
=
vtype
::
size
();
size_t
nvec
=
(
nydirty
/
2
+
1
+
(
vlen
-
1
))
/
vlen
;
vector
<
vtype
>
ph
(
nvec
),
sp
(
nvec
),
cp
(
nvec
);
while
(
auto
rng
=
sched
.
getNext
())
for
(
auto
i
=
rng
.
lo
;
i
<
rng
.
hi
;
++
i
)
for
(
auto
i
=
lo
;
i
<
hi
;
++
i
)
{
T
fx
=
T
(
x0
+
i
*
pixsize_x
);
fx
*=
fx
;
...
...
@@ -511,784 +550,799 @@ template<typename T> class Params
iv0
=
min
(
int
(
v
+
vshift
)
-
int
(
nv
),
maxiv0
);
}
void
report
()
{
if
(
verbosity
==
0
)
return
;
cout
<<
(
gridding
?
"Gridding"
:
"Degridding"
)
<<
": nthreads="
<<
nthreads
<<
", "
<<
"dirty=("
<<
nxdirty
<<
"x"
<<
nydirty
<<
"), "
<<
"grid=("
<<
nu
<<
"x"
<<
nv
;
if
(
nplanes
>
0
)
cout
<<
"x"
<<
nplanes
;
cout
<<
"), nvis="
<<
nvis
<<
", supp="
<<
supp
<<
", eps="
<<
(
epsilon
*
((
nplanes
==
0
)
?
2
:
3
))
<<
endl
;
cout
<<
" w=["
<<
wmin_d
<<
"; "
<<
wmax_d
<<
"], min(n-1)="
<<
nm1min
<<
", dw="
<<
dw
<<
", wmax/dw="
<<
wmax_d
/
dw
<<
", nranges="
<<
ranges
.
size
()
<<
endl
;
}
void
scanData
()
{
timers
.
push
(
"Initial scan"
);
size_t
nrow
=
bl
.
Nrows
(),
nchan
=
bl
.
Nchannels
();
bool
have_wgt
=
wgt
.
size
()
!=
0
;
if
(
have_wgt
)
checkShape
(
wgt
.
shape
(),{
nrow
,
nchan
});
bool
have_ms
=
ms_in
.
size
()
!=
0
;
if
(
have_ms
)
checkShape
(
ms_in
.
shape
(),
{
nrow
,
nchan
});
bool
have_mask
=
mask
.
size
()
!=
0
;
if
(
have_mask
)
checkShape
(
mask
.
shape
(),
{
nrow
,
nchan
});
active
.
resize
(
nrow
*
nchan
,
0
);
nvis
=
0
;
wmin_d
=
1e300
;
wmax_d
=-
1e300
;
mutex
mut
;
execParallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
double
lwmin_d
=
1e300
,
lwmax_d
=-
1e300
;
size_t
lnvis
=
0
;
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nrow
);
for
(
auto
irow
=
lo
;
irow
<
hi
;
++
irow
)
for
(
size_t
ichan
=
0
,
idx
=
irow
*
nchan
;
ichan
<
nchan
;
++
ichan
,
++
idx
)
if
(((
!
have_ms
)
||
(
norm
(
ms_in
(
irow
,
ichan
))
!=
0
))
&&
((
!
have_wgt
)
||
(
wgt
(
irow
,
ichan
)
!=
0
))
&&
((
!
have_mask
)
||
(
mask
(
irow
,
ichan
)
!=
0
)))
{
++
lnvis
;
active
[
irow
*
nchan
+
ichan
]
=
1
;
auto
uvw
=
bl
.
effectiveCoord
(
irow
,
ichan
);
double
w
=
abs
(
uvw
.
w
);
lwmin_d
=
min
(
lwmin_d
,
w
);
lwmax_d
=
max
(
lwmax_d
,
w
);
}
{
lock_guard
<
mutex
>
lock
(
mut
);
wmin_d
=
min
(
wmin_d
,
lwmin_d
);
wmax_d
=
max
(
wmax_d
,
lwmax_d
);
nvis
+=
lnvis
;
}
});
timers
.
pop
();
}
auto
getNuNv
()
{
timers
.
push
(
"parameter calculation"
);
double
x0
=
-
0.5
*
nxdirty
*
pixsize_x
,
y0
=
-
0.5
*
nydirty
*
pixsize_y
;
nm1min
=
sqrt
(
max
(
1.
-
x0
*
x0
-
y0
*
y0
,
0.
))
-
1.
;
if
(
x0
*
x0
+
y0
*
y0
>
1.
)
nm1min
=
-
sqrt
(
abs
(
1.
-
x0
*
x0
-
y0
*
y0
))
-
1.
;
auto
idx
=
getAvailableKernels
<
T
>
(
epsilon
);
double
mincost
=
1e300
;
constexpr
double
nref_fft
=
2048
;
constexpr
double
costref_fft
=
0.0693
;
size_t
minnu
=
0
,
minnv
=
0
,
minidx
=
KernelDB
.
size
();
constexpr
size_t
vlen
=
native_simd
<
T
>::
size
();
for
(
size_t
i
=
0
;
i
<
idx
.
size
();
++
i
)
{
const
auto
&
krn
(
KernelDB
[
idx
[
i
]]);
auto
supp
=
krn
.
W
;
auto
nvec
=
(
supp
+
vlen
-
1
)
/
vlen
;
auto
ofactor
=
krn
.
ofactor
;
size_t
nu
=
2
*
good_size_complex
(
size_t
(
nxdirty
*
ofactor
*
0.5
)
+
1
);
size_t
nv
=
2
*
good_size_complex
(
size_t
(
nydirty
*
ofactor
*
0.5
)
+
1
);
double
logterm
=
log
(
nu
*
nv
)
/
log
(
nref_fft
*
nref_fft
);
double
fftcost
=
nu
/
nref_fft
*
nv
/
nref_fft
*
logterm
*
costref_fft
;
double
gridcost
=
2.2e-10
*
nvis
*
(
supp
*
nvec
*
vlen
+
((
2
*
nvec
+
1
)
*
(
supp
+
3
)
*
vlen
));
if
(
do_wgridding
)
void
report
()
{
double
dw
=
0.5
/
ofactor
/
abs
(
nm1min
);
size_t
nplanes
=
size_t
((
wmax_d
-
wmin_d
)
/
dw
+
supp
);
fftcost
*=
nplanes
;
gridcost
*=
supp
;
if
(
verbosity
==
0
)
return
;
cout
<<
(
gridding
?
"Gridding"
:
"Degridding"
)
<<
": nthreads="
<<
nthreads
<<
", "
<<
"dirty=("
<<
nxdirty
<<
"x"
<<
nydirty
<<
"), "
<<
"grid=("
<<
nu
<<
"x"
<<
nv
;
if
(
nplanes
>
0
)
cout
<<
"x"
<<
nplanes
;
cout
<<
"), nvis="
<<
nvis
<<
", supp="
<<
supp
<<
", eps="
<<
(
epsilon
*
((
nplanes
==
0
)
?
2
:
3
))
<<
endl
;
cout
<<
" w=["
<<
wmin_d
<<
"; "
<<
wmax_d
<<
"], min(n-1)="
<<
nm1min
<<
", dw="
<<
dw
<<
", wmax/dw="
<<
wmax_d
/
dw
<<
", nranges="
<<
ranges
.
size
()
<<
endl
;
}
double
cost
=
fftcost
+
gridcost
;
if
(
cost
<
mincost
)
void
scanData
(
)
{
mincost
=
cost
;
minnu
=
nu
;
minnv
=
nv
;
minidx
=
idx
[
i
];
timers
.
push
(
"Initial scan"
);
size_t
nrow
=
bl
.
Nrows
(),
nchan
=
bl
.
Nchannels
();
bool
have_wgt
=
wgt
.
size
()
!=
0
;
if
(
have_wgt
)
checkShape
(
wgt
.
shape
(),{
nrow
,
nchan
});
bool
have_ms
=
ms_in
.
size
()
!=
0
;
if
(
have_ms
)
checkShape
(
ms_in
.
shape
(),
{
nrow
,
nchan
});
bool
have_mask
=
mask
.
size
()
!=
0
;
if
(
have_mask
)
checkShape
(
mask
.
shape
(),
{
nrow
,
nchan
});
active
.
resize
(
nrow
*
nchan
,
0
);
nvis
=
0
;
wmin_d
=
1e300
;
wmax_d
=-
1e300
;
mutex
mut
;
execParallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
double
lwmin_d
=
1e300
,
lwmax_d
=-
1e300
;
size_t
lnvis
=
0
;
auto
tid
=
sched
.
thread_num
();
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nrow
);
for
(
auto
irow
=
lo
;
irow
<
hi
;
++
irow
)
for
(
size_t
ichan
=
0
,
idx
=
irow
*
nchan
;
ichan
<
nchan
;
++
ichan
,
++
idx
)
if
(((
!
have_ms
)
||
(
norm
(
ms_in
(
irow
,
ichan
))
!=
0
))
&&
((
!
have_wgt
)
||
(
wgt
(
irow
,
ichan
)
!=
0
))
&&
((
!
have_mask
)
||
(
mask
(
irow
,
ichan
)
!=
0
)))
{
++
lnvis
;
active
[
irow
*
nchan
+
ichan
]
=
1
;
auto
uvw
=
bl
.
effectiveCoord
(
irow
,
ichan
);
double
w
=
abs
(
uvw
.
w
);
lwmin_d
=
min
(
lwmin_d
,
w
);
lwmax_d
=
max
(
lwmax_d
,
w
);
}
{
lock_guard
<
mutex
>
lock
(
mut
);
wmin_d
=
min
(
wmin_d
,
lwmin_d
);
wmax_d
=
max
(
wmax_d
,
lwmax_d
);
nvis
+=
lnvis
;
}
});
timers
.
pop
();
}
}
timers
.
pop
();
nu
=
minnu
;
nv
=
minnv
;
return
minidx
;
}
void
countRanges
()
{
timers
.
push
(
"range count"
);
size_t
nrow
=
bl
.
Nrows
(),
nchan
=
bl
.
Nchannels
();
dw
=
0.5
/
ofactor
/
abs
(
nm1min
);
nplanes
=
size_t
((
wmax_d
-
wmin_d
)
/
dw
+
supp
);
wmin
=
(
wmin_d
+
wmax_d
)
*
0.5
-
0.5
*
(
nplanes
-
1
)
*
dw
;
struct
bufvec
{
VVR
v
;
uint64_t
dummy
[
8
];
};
auto
Sorter
=
[](
const
visrange
&
a
,
const
visrange
&
b
)
{
return
a
.
uvwidx
()
<
b
.
uvwidx
();
};
vector
<
bufvec
>
lranges
(
nthreads
);
execParallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
auto
tid
=
sched
.
thread_num
();
auto
&
myranges
(
lranges
[
tid
].
v
);
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nrow
);
for
(
auto
irow
=
lo
;
irow
<
hi
;
++
irow
)
auto
getNuNv
()
{
bool
on
=
false
;
int
iulast
,
ivlast
,
plast
;
size_t
chan0
=
0
;
for
(
size_t
ichan
=
0
;
ichan
<
nchan
;
++
ichan
)
timers
.
push
(
"parameter calculation"
);
double
x0
=
-
0.5
*
nxdirty
*
pixsize_x
,
y0
=
-
0.5
*
nydirty
*
pixsize_y
;
nm1min
=
sqrt
(
max
(
1.
-
x0
*
x0
-
y0
*
y0
,
0.
))
-
1.
;
if
(
x0
*
x0
+
y0
*
y0
>
1.
)
nm1min
=
-
sqrt
(
abs
(
1.
-
x0
*
x0
-
y0
*
y0
))
-
1.
;
auto
idx
=
getAvailableKernels
<
T
>
(
epsilon
);
double
mincost
=
1e300
;
constexpr
double
nref_fft
=
2048
;
constexpr
double
costref_fft
=
0.0693
;
size_t
minnu
=
0
,
minnv
=
0
,
minidx
=
KernelDB
.
size
();
constexpr
size_t
vlen
=
native_simd
<
T
>::
size
();
for
(
size_t
i
=
0
;
i
<
idx
.
size
();
++
i
)
{
if
(
active
[
irow
*
nchan
+
ichan
])
const
auto
&
krn
(
KernelDB
[
idx
[
i
]]);
auto
supp
=
krn
.
W
;
auto
nvec
=
(
supp
+
vlen
-
1
)
/
vlen
;
auto
ofactor
=
krn
.
ofactor
;
size_t
nu
=
2
*
good_size_complex
(
size_t
(
nxdirty
*
ofactor
*
0.5
)
+
1
);
size_t
nv
=
2
*
good_size_complex
(
size_t
(
nydirty
*
ofactor
*
0.5
)
+
1
);
double
logterm
=
log
(
nu
*
nv
)
/
log
(
nref_fft
*
nref_fft
);
double
fftcost
=
nu
/
nref_fft
*
nv
/
nref_fft
*
logterm
*
costref_fft
;
double
gridcost
=
2.2e-10
*
nvis
*
(
supp
*
nvec
*
vlen
+
((
2
*
nvec
+
1
)
*
(
supp
+
3
)
*
vlen
));
if
(
do_wgridding
)
{
auto
uvw
=
bl
.
effectiveCoord
(
irow
,
ichan
);
if
(
uvw
.
w
<
0
)
uvw
.
Flip
();
double
u
,
v
;
int
iu0
,
iv0
,
iw
;
getpix
(
uvw
.
u
,
uvw
.
v
,
u
,
v
,
iu0
,
iv0
);
iu0
=
(
iu0
+
nsafe
)
>>
logsquare
;
iv0
=
(
iv0
+
nsafe
)
>>
logsquare
;
iw
=
max
(
0
,
int
(
1
+
(
abs
(
uvw
.
w
)
-
(
0.5
*
supp
*
dw
)
-
wmin
)
/
dw
));
if
(
!
on
)
// new active region
{
on
=
true
;
iulast
=
iu0
;
ivlast
=
iv0
;
plast
=
iw
;
chan0
=
ichan
;
}
else
if
((
iu0
!=
iulast
)
||
(
iv0
!=
ivlast
)
||
(
iw
!=
plast
))
// change of active region
{
myranges
.
emplace_back
(
iulast
,
ivlast
,
plast
,
irow
,
chan0
,
ichan
);
iulast
=
iu0
;
ivlast
=
iv0
;
plast
=
iw
;
chan0
=
ichan
;
}
double
dw
=
0.5
/
ofactor
/
abs
(
nm1min
);
size_t
nplanes
=
size_t
((
wmax_d
-
wmin_d
)
/
dw
+
supp
);
fftcost
*=
nplanes
;
gridcost
*=
supp
;
}
else
if
(
on
)
// end of active region
double
cost
=
fftcost
+
gridcost
;
if
(
cost
<
mincost
)
{
myranges
.
emplace_back
(
iulast
,
ivlast
,
plast
,
irow
,
chan0
,
ichan
);
on
=
false
;
mincost
=
cost
;
minnu
=
nu
;
minnv
=
nv
;
minidx
=
idx
[
i
];
}
}
if
(
on
)
// end of active region at last channel
myranges
.
emplace_back
(
iulast
,
ivlast
,
plast
,
irow
,
chan0
,
nchan
);
timers
.
pop
();
nu
=
minnu
;
nv
=
minnv
;
return
minidx
;
}
sort
(
myranges
.
begin
(),
myranges
.
end
(),
Sorter
);
});
// free mask memory
vector
<
uint8_t
>
().
swap
(
active
);
timers
.
poppush
(
"range merging"
);
size_t
nth
=
nthreads
;
while
(
nth
>
1
)
{
auto
nmerge
=
nth
/
2
;
execParallel
(
nmerge
,
[
&
](
Scheduler
&
sched
)
void
countRanges
()
{
auto
tid
=
sched
.
thread_num
();
auto
tid_partner
=
nth
-
1
-
tid
;
VVR
tmp
;
tmp
.
reserve
(
lranges
[
tid
].
v
.
size
()
+
lranges
[
tid_partner
].
v
.
size
());
merge
(
lranges
[
tid
].
v
.
begin
(),
lranges
[
tid
].
v
.
end
(),
lranges
[
tid_partner
].
v
.
begin
(),
lranges
[
tid_partner
].
v
.
end
(),
back_inserter
(
tmp
),
Sorter
);
lranges
[
tid
].
v
.
swap
(
tmp
);
VVR
().
swap
(
lranges
[
tid_partner
].
v
);
});
nth
-=
nmerge
;
}
ranges
.
swap
(
lranges
[
0
].
v
);
timers
.
pop
();
}
timers
.
push
(
"range count"
);
size_t
nrow
=
bl
.
Nrows
(),
nchan
=
bl
.
Nchannels
();
void
apply_global_corrections
(
mav
<
T
,
2
>
&
dirty
)
{
timers
.
push
(
"global corrections"
);
double
x0
=
-
0.5
*
nxdirty
*
pixsize_x
,
y0
=
-
0.5
*
nydirty
*
pixsize_y
;
auto
cfu
=
krn
->
corfunc
(
nxdirty
/
2
+
1
,
1.
/
nu
,
nthreads
);
auto
cfv
=
krn
->
corfunc
(
nydirty
/
2
+
1
,
1.
/
nv
,
nthreads
);
execStatic
(
nxdirty
/
2
+
1
,
nthreads
,
0
,
[
&
](
Scheduler
&
sched
)
{
while
(
auto
rng
=
sched
.
getNext
())
for
(
auto
i
=
rng
.
lo
;
i
<
rng
.
hi
;
++
i
)
{
auto
fx
=
T
(
x0
+
i
*
pixsize_x
);
fx
*=
fx
;
for
(
size_t
j
=
0
;
j
<=
nydirty
/
2
;
++
j
)
dw
=
0.5
/
ofactor
/
abs
(
nm1min
);
nplanes
=
size_t
((
wmax_d
-
wmin_d
)
/
dw
+
supp
);
wmin
=
(
wmin_d
+
wmax_d
)
*
0.5
-
0.5
*
(
nplanes
-
1
)
*
dw
;
struct
bufvec
{
auto
fy
=
T
(
y0
+
j
*
pixsize_y
);
fy
*=
fy
;
T
fct
=
0
;
auto
tmp
=
1
-
fx
-
fy
;
if
(
tmp
>=
0
)
{
auto
nm1
=
(
-
fx
-
fy
)
/
(
sqrt
(
tmp
)
+
1
);
// accurate form of sqrt(1-x-y)-1
fct
=
T
(
krn
->
corfunc
(
nm1
*
dw
));
if
(
divide_by_n
)
fct
/=
nm1
+
1
;
}
else
// beyond the horizon, don't really know what to do here
VVR
v
;
uint64_t
dummy
[
8
];
};
auto
Sorter
=
[](
const
visrange
&
a
,
const
visrange
&
b
)
{
return
a
.
uvwidx
()
<
b
.
uvwidx
();
};
vector
<
bufvec
>
lranges
(
nthreads
);
execParallel
(
nthreads
,
[
&
](
Scheduler
&
sched
)
{
auto
tid
=
sched
.
thread_num
();
auto
&
myranges
(
lranges
[
tid
].
v
);
auto
[
lo
,
hi
]
=
calcShare
(
nthreads
,
tid
,
nrow
);
for
(
auto
irow
=
lo
;
irow
<
hi
;
++
irow
)
{
if
(
divide_by_n
)
fct
=
0
;
else
bool
on
=
false
;
int
iulast
,
ivlast
,
plast
;
size_t
chan0
=
0
;
for
(
size_t
ichan
=
0
;
ichan
<
nchan
;
++
ichan
)
{
auto
nm1
=
sqrt
(
-
tmp
)
-
1
;
fct
=
T
(
krn
->
corfunc
(
nm1
*
dw
));
if
(
active
[
irow
*
nchan
+
ichan
])
{
auto
uvw
=
bl
.
effectiveCoord
(
irow
,
ichan
);
if
(
uvw
.
w
<
0
)
uvw
.
Flip
();
double
u
,
v
;
int
iu0
,
iv0
,
iw
;
getpix
(
uvw
.
u
,
uvw
.
v
,
u
,
v
,
iu0
,
iv0
);
iu0
=
(
iu0
+
nsafe
)
>>
logsquare
;
iv0
=
(
iv0
+
nsafe
)
>>
logsquare
;
iw
=
max
(
0
,
int
(
1
+
(
abs
(
uvw
.
w
)
-
(
0.5
*
supp
*
dw
)
-
wmin
)
/
dw
));
if
(
!
on
)
// new active region
{
on
=
true
;
iulast
=
iu0
;
ivlast
=
iv0
;
plast
=
iw
;
chan0
=
ichan
;
}
else
if
((
iu0
!=
iulast
)
||
(
iv0
!=
ivlast
)
||
(
iw
!=
plast
))
// change of active region
{
myranges
.
emplace_back
(
iulast
,
ivlast
,
plast
,
irow
,
chan0
,
ichan
);
iulast
=
iu0
;
ivlast
=
iv0
;
plast
=
iw
;
chan0
=
ichan
;
}
}
else
if
(
on
)
// end of active region
{
myranges
.
emplace_back
(
iulast
,
ivlast
,
plast
,
irow
,
chan0
,
ichan
);
on
=
false
;
}
}
if
(
on
)
// end of active region at last channel
myranges
.
emplace_back
(
iulast
,
ivlast
,
plast
,
irow
,
chan0
,
nchan
);
}
fct
*=
T
(
cfu
[
nxdirty
/
2
-
i
]
*
cfv
[
nydirty
/
2
-
j
]);