Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Martin Reinecke
ducc
Commits
4b36a29f
Commit
4b36a29f
authored
Apr 24, 2020
by
Martin Reinecke
Browse files
Merge branch 'convolution_work' into 'master'
Convolution work See merge request mtr/cxxbase!7
parents
474ffe65
ed1b7cf7
Pipeline
#73641
passed with stages
in 8 minutes and 49 seconds
Changes
4
Pipelines
1
Show whitespace changes
Inline
Side-by-side
pyinterpol_ng/demo.py
View file @
4b36a29f
...
...
@@ -42,11 +42,13 @@ def convolve(alm1, alm2, lmax):
lmax
=
60
kmax
=
13
ncomp
=
1
separate
=
Tru
e
n
comp2
=
ncomp
if
separate
else
1
separate
=
Fals
e
n
ptg
=
1000000
epsilon
=
1e-4
ofactor
=
2
nthreads
=
0
ofactor
=
1.5
nthreads
=
0
# use as many threads as available
ncomp2
=
ncomp
if
separate
else
1
# get random sky a_lm
# the a_lm arrays follow the same conventions as those in healpy
...
...
@@ -93,13 +95,14 @@ plt.imshow(bar2-bar[:,:,0])
plt
.
show
()
ptg
=
np
.
random
.
uniform
(
0.
,
1.
,
3
*
1000000
).
reshape
(
1000000
,
3
)
ptg
=
np
.
random
.
uniform
(
0.
,
1.
,
3
*
nptg
).
reshape
(
nptg
,
3
)
ptg
[:,
0
]
*=
np
.
pi
ptg
[:,
1
]
*=
2
*
np
.
pi
ptg
[:,
2
]
*=
2
*
np
.
pi
#foo = pyinterpol_ng.PyInterpolator(slm,blm,separate,lmax, kmax, epsilon=1e-6, nthreads=2)
t0
=
time
.
time
()
bar
=
foo
.
interpol
(
ptg
)
del
foo
print
(
"interpolation time: "
,
time
.
time
()
-
t0
)
fake
=
np
.
random
.
uniform
(
0.
,
1.
,
(
ptg
.
shape
[
0
],
ncomp2
))
foo2
=
pyinterpol_ng
.
PyInterpolator
(
lmax
,
kmax
,
ncomp2
,
epsilon
=
epsilon
,
ofactor
=
ofactor
,
nthreads
=
nthreads
)
...
...
pyinterpol_ng/interpol_ng.h
View file @
4b36a29f
...
...
@@ -22,6 +22,96 @@
namespace
mr
{
namespace
detail_fft
{
using
std
::
vector
;
// Allocates the scratch buffer needed for a 1D convolution along `axis`.
//
// info : shape information of the array being processed
// axis : axis along which the convolution runs
// len  : number of entries one line needs in the scratch buffer
//
// Returns an aligned_array<T> holding `len` entries, multiplied by the SIMD
// width native_simd<T0>::size() when the array contains at least that many
// lines along `axis` (info.size()/info.shape(axis)), so that a whole batch of
// lines fits into the buffer at once.
template<typename T, typename T0> aligned_array<T> alloc_tmp_conv
  (const fmav_info &info, size_t axis, size_t len)
  {
  constexpr auto vlen = native_simd<T0>::size();
  const size_t axlen = info.shape(axis);
  // Guard the division: an empty axis would otherwise divide by zero.
  const size_t othersize = (axlen==0) ? size_t(0) : info.size()/axlen;
  const auto tmpsize = len*((othersize>=vlen) ? vlen : 1);
  return aligned_array<T>(tmpsize);
  }
// Shared driver for 1D convolutions along one axis of an fmav.
//
// One plan of type Tplan is obtained for the input axis length and one for
// the output axis length. The work is then distributed via execParallel; each
// worker allocates its own scratch buffer (sized for the longer of the two
// axis lengths) and calls `exec` for every position of its multi_iter.
//
// in       : source array
// out      : destination array; its length along `axis` may differ from `in`
// axis     : axis along which to operate
// kernel   : forwarded unchanged to `exec`; must have min(l_in,l_out)/2+1
//            entries, otherwise MR_assert fails with "bad kernel size"
// nthreads : thread count request, resolved by util::thread_count
// exec     : callable invoked as
//            exec(iterator, in, out, buffer, plan_in, plan_out, kernel)
template<typename Tplan, typename T, typename T0, typename Exec>
MRUTIL_NOINLINE void general_convolve(const fmav<T> &in, fmav<T> &out,
  const size_t axis, const vector<T0> &kernel, size_t nthreads,
  const Exec &exec)
  {
  const size_t l_in = in.shape(axis);
  const size_t l_out = out.shape(axis);
  const size_t l_min = std::min(l_in, l_out);
  const size_t l_max = std::max(l_in, l_out);
  MR_assert(kernel.size()==l_min/2+1, "bad kernel size");

  // The plans are created only after the kernel size has been validated.
  const std::shared_ptr<Tplan> plan_in = get_plan<Tplan>(l_in);
  const std::shared_ptr<Tplan> plan_out = get_plan<Tplan>(l_out);

  const auto nthr =
    util::thread_count(nthreads, in, axis, native_simd<T0>::size());

  execParallel(nthr, [&](Scheduler &sched)
    {
    constexpr auto vlen = native_simd<T0>::size();

    // Per-worker scratch space, reused for every batch handled below.
    auto scratch = alloc_tmp_conv<T,T0>(in, axis, l_max);
    multi_iter<vlen> iter(in, out, axis,
                          sched.num_threads(), sched.thread_num());

#ifndef MRUTIL_NO_SIMD
    // Vectorised pass: handle vlen lines per call while enough remain.
    if (vlen>1)
      {
      auto vec_buf = reinterpret_cast<add_vec_t<T> *>(scratch.data());
      while (iter.remaining()>=vlen)
        {
        iter.advance(vlen);
        exec(iter, in, out, vec_buf, *plan_in, *plan_out, kernel);
        }
      }
#endif

    // Scalar pass: whatever is left is processed one line at a time.
    auto scalar_buf = reinterpret_cast<T *>(scratch.data());
    while (iter.remaining()>0)
      {
      iter.advance(1);
      exec(iter, in, out, scalar_buf, *plan_in, *plan_out, kernel);
      }
    });  // end of parallel region
  }
// Worker functor for general_convolve operating on real-valued data.
//
// For the line(s) currently selected by `it` it
//   1. copies the input into `buf`,
//   2. runs plan1.exec(buf, T0(1), true),
//   3. scales the first min(l_in,l_out) entries by kernel[(i+1)/2], i.e.
//      entries 2k-1 and 2k share kernel[k],
//   4. sets the entries from l_in up to l_out to zero (a no-op unless the
//      output is longer than the input),
//   5. runs plan2.exec(buf, T0(1), false),
//   6. copies `buf` to the output.
// NOTE(review): the boolean passed to exec() presumably selects the
// transform direction, and the (i+1)/2 indexing suggests a packed
// real/imaginary coefficient layout -- confirm against pocketfft_r.
struct ExecConvR1
  {
  template <typename T0, typename T, size_t vlen> void operator() (
    const multi_iter<vlen> &it, const fmav<T0> &in, fmav<T0> &out,
    T *buf, const pocketfft_r<T0> &plan1, const pocketfft_r<T0> &plan2,
    const vector<T0> &kernel) const
    {
    const size_t n_in = plan1.length();
    const size_t n_out = plan2.length();
    const size_t n_common = std::min(n_in, n_out);

    copy_input(it, in, buf);
    plan1.exec(buf, T0(1), true);

    // Apply the kernel to all coefficients present in both lengths.
    for (size_t pos=0; pos<n_common; ++pos)
      {
      buf[pos] *= kernel[(pos+1)/2];
      }

    // Zero the coefficients beyond the input length.
    for (size_t pos=n_in; pos<n_out; ++pos)
      {
      buf[pos] = T(0);
      }

    plan2.exec(buf, T0(1), false);
    copy_output(it, buf, out);
    }
  };
// Convolves `in` along `axis` with `kernel` and stores the result in `out`.
//
// Preconditions, each enforced with MR_assert:
//   - `axis` is a valid axis of `in`
//   - `in` and `out` have the same number of dimensions
//   - if both arrays share the same data pointer, their strides are equal
//   - all axes other than `axis` have identical lengths in `in` and `out`
//   - the lengths of `in` and `out` along `axis` are both even
// The kernel length is checked inside general_convolve.
//
// Nothing is done when `in` has no elements. Otherwise the work is delegated
// to general_convolve using pocketfft_r<T> plans and the ExecConvR1 functor.
//
// nthreads : thread count request passed on to general_convolve (default 1)
template<typename T> void convolve_1d(const fmav<T> &in,
  fmav<T> &out, size_t axis, const vector<T> &kernel, size_t nthreads=1)
  {
  MR_assert(axis<in.ndim(), "bad axis number");
  MR_assert(in.ndim()==out.ndim(), "dimensionality mismatch");

  // In-place operation is only accepted with matching memory layouts.
  const bool same_buffer = (in.data()==out.data());
  if (same_buffer)
    {
    MR_assert(in.stride()==out.stride(), "strides mismatch");
    }

  // Only the convolution axis is allowed to change its length.
  for (size_t i=0; i<in.ndim(); ++i)
    {
    if (i==axis) continue;
    MR_assert(in.shape(i)==out.shape(i), "shape mismatch");
    }

  MR_assert(!((in.shape(axis)&1) || (out.shape(axis)&1)),
    "input and output axis lengths must be even");

  if (in.size()==0)
    return;

  general_convolve<pocketfft_r<T>>(in, out, axis, kernel, nthreads,
    ExecConvR1());
  }
}
using
detail_fft
::
convolve_1d
;
namespace
detail_interpol_ng
{
using
namespace
std
;
...
...
@@ -41,78 +131,54 @@ template<typename T> class Interpolator
void
correct
(
mav
<
T
,
2
>
&
arr
,
int
spin
)
{
T
sfct
=
(
spin
&
1
)
?
-
1
:
1
;
mav
<
T
,
2
>
tmp
({
nphi
,
nphi
});
tmp
.
apply
([](
T
&
v
){
v
=
0.
;});
auto
tmp0
=
tmp
.
template
subarray
<
2
>({
0
,
0
},{
nphi0
,
nphi0
});
fmav
<
T
>
ftmp0
(
tmp0
);
for
(
size_t
i
=
0
;
i
<
ntheta0
;
++
i
)
mav
<
T
,
2
>
tmp
({
nphi
,
nphi0
});
// copy and extend to second half
for
(
size_t
j
=
0
;
j
<
nphi0
;
++
j
)
tmp0
.
v
(
i
,
j
)
=
arr
(
i
,
j
);
// extend to second half
tmp
.
v
(
0
,
j
)
=
arr
(
0
,
j
);
for
(
size_t
i
=
1
,
i2
=
nphi0
-
1
;
i
+
1
<
ntheta0
;
++
i
,
--
i2
)
for
(
size_t
j
=
0
,
j2
=
nphi0
/
2
;
j
<
nphi0
;
++
j
,
++
j2
)
{
if
(
j2
>=
nphi0
)
j2
-=
nphi0
;
tmp0
.
v
(
i2
,
j
)
=
sfct
*
tmp0
(
i
,
j2
);
}
// FFT to frequency domain on minimal grid
r2r_fftpack
(
ftmp0
,
ftmp0
,{
0
,
1
},
true
,
true
,
T
(
1.
/
(
nphi0
*
nphi0
)),
nthreads
);
// correct amplitude at Nyquist frequency
for
(
size_t
i
=
0
;
i
<
nphi0
;
++
i
)
{
tmp0
.
v
(
i
,
nphi0
-
1
)
*=
0.5
;
tmp0
.
v
(
nphi0
-
1
,
i
)
*=
0.5
;
tmp
.
v
(
i
,
j2
)
=
arr
(
i
,
j2
);
tmp
.
v
(
i2
,
j
)
=
sfct
*
tmp
(
i
,
j2
);
}
auto
fct
=
kernel
.
correction_factors
(
nphi
,
nphi0
/
2
+
1
,
nthreads
);
for
(
size_t
i
=
0
;
i
<
nphi0
;
++
i
)
for
(
size_t
j
=
0
;
j
<
nphi0
;
++
j
)
tmp
0
.
v
(
i
,
j
)
*
=
fct
[(
i
+
1
)
/
2
]
*
fct
[(
j
+
1
)
/
2
]
;
auto
tmp1
=
tmp
.
template
subarray
<
2
>({
0
,
0
},{
nphi
,
nphi0
}
);
f
mav
<
T
>
ftmp1
(
tmp1
)
;
// zero-padded FFT in theta direction
r2r_fftpack
(
ftmp1
,
ftmp1
,{
0
},
false
,
false
,
T
(
1
),
nthreads
);
auto
tmp2
=
tmp
.
template
subarray
<
2
>({
0
,
0
},{
ntheta
,
nphi
}
);
fmav
<
T
>
ftmp2
(
tmp
2
);
tmp
.
v
(
ntheta0
-
1
,
j
)
=
arr
(
ntheta0
-
1
,
j
)
;
auto
fct
=
kernel
.
correction_factors
(
nphi
,
nphi0
/
2
+
1
,
nthreads
);
f
or
(
auto
&
f
:
fct
)
f
/=
nphi0
;
fmav
<
T
>
ftmp
(
tmp
);
fmav
<
T
>
ftmp0
(
tmp
.
template
subarray
<
2
>({
0
,
0
},{
nphi0
,
nphi0
})
);
convolve_1d
(
ftmp0
,
ftmp
,
0
,
fct
,
nthreads
);
fmav
<
T
>
ftmp2
(
tmp
.
template
subarray
<
2
>({
0
,
0
},{
ntheta
,
nphi0
})
);
fmav
<
T
>
farr
(
arr
);
// zero-padded FFT in phi direction
r2r_fftpack
(
ftmp2
,
farr
,{
1
},
false
,
false
,
T
(
1
),
nthreads
);
convolve_1d
(
ftmp2
,
farr
,
1
,
fct
,
nthreads
);
}
void
decorrect
(
mav
<
T
,
2
>
&
arr
,
int
spin
)
{
T
sfct
=
(
spin
&
1
)
?
-
1
:
1
;
mav
<
T
,
2
>
tmp
({
nphi
,
nphi
});
fmav
<
T
>
ftmp
(
tmp
);
f
or
(
size_t
i
=
0
;
i
<
ntheta
;
++
i
)
for
(
size_t
j
=
0
;
j
<
nphi
;
++
j
)
tmp
.
v
(
i
,
j
)
=
arr
(
i
,
j
);
mav
<
T
,
2
>
tmp
({
nphi
,
nphi
0
});
auto
fct
=
kernel
.
correction_factors
(
nphi
,
nphi0
/
2
+
1
,
nthreads
);
for
(
auto
&
f
:
fct
)
f
/=
nphi0
;
f
mav
<
T
>
farr
(
arr
);
fmav
<
T
>
ftmp2
(
tmp
.
template
subarray
<
2
>({
0
,
0
},{
ntheta
,
nphi0
}));
convolve_1d
(
farr
,
ftmp2
,
1
,
fct
,
nthreads
);
// extend to second half
for
(
size_t
i
=
1
,
i2
=
nphi
-
1
;
i
+
1
<
ntheta
;
++
i
,
--
i2
)
for
(
size_t
j
=
0
,
j2
=
nphi
/
2
;
j
<
nphi
;
++
j
,
++
j2
)
for
(
size_t
j
=
0
,
j2
=
nphi
0
/
2
;
j
<
nphi
0
;
++
j
,
++
j2
)
{
if
(
j2
>=
nphi
)
j2
-=
nphi
;
if
(
j2
>=
nphi
0
)
j2
-=
nphi
0
;
tmp
.
v
(
i2
,
j
)
=
sfct
*
tmp
(
i
,
j2
);
}
r2r_fftpack
(
ftmp
,
ftmp
,{
1
},
true
,
true
,
T
(
1
),
nthreads
);
auto
tmp1
=
tmp
.
template
subarray
<
2
>({
0
,
0
},{
nphi
,
nphi0
});
fmav
<
T
>
ftmp1
(
tmp1
);
r2r_fftpack
(
ftmp1
,
ftmp1
,{
0
},
true
,
true
,
T
(
1
),
nthreads
);
auto
fct
=
kernel
.
correction_factors
(
nphi
,
nphi0
/
2
+
1
,
nthreads
);
auto
tmp0
=
tmp
.
template
subarray
<
2
>({
0
,
0
},{
nphi0
,
nphi0
});
fmav
<
T
>
ftmp0
(
tmp0
);
for
(
size_t
i
=
0
;
i
<
nphi0
;
++
i
)
fmav
<
T
>
ftmp
(
tmp
);
fmav
<
T
>
ftmp0
(
tmp
.
template
subarray
<
2
>({
0
,
0
},{
nphi0
,
nphi0
}));
convolve_1d
(
ftmp
,
ftmp0
,
0
,
fct
,
nthreads
);
for
(
size_t
j
=
0
;
j
<
nphi0
;
++
j
)
tmp0
.
v
(
i
,
j
)
*=
fct
[(
i
+
1
)
/
2
]
*
fct
[(
j
+
1
)
/
2
];
// FFT to (theta, phi) domain on minimal grid
r2r_fftpack
(
ftmp0
,
ftmp0
,{
0
,
1
},
false
,
false
,
T
(
1.
/
(
nphi0
*
nphi0
)),
nthreads
);
arr
.
v
(
0
,
j
)
=
0.5
*
tmp
(
0
,
j
);
for
(
size_t
i
=
1
;
i
+
1
<
ntheta0
;
++
i
)
for
(
size_t
j
=
0
;
j
<
nphi0
;
++
j
)
{
tmp0
.
v
(
0
,
j
)
*=
0.5
;
tmp0
.
v
(
ntheta0
-
1
,
j
)
*=
0.5
;
}
for
(
size_t
i
=
0
;
i
<
ntheta0
;
++
i
)
arr
.
v
(
i
,
j
)
=
tmp
(
i
,
j
);
for
(
size_t
j
=
0
;
j
<
nphi0
;
++
j
)
arr
.
v
(
i
,
j
)
=
tmp0
(
i
,
j
);
arr
.
v
(
ntheta0
-
1
,
j
)
=
0.5
*
tmp
(
ntheta0
-
1
,
j
);
}
vector
<
size_t
>
getIdx
(
const
mav
<
T
,
2
>
&
ptg
)
const
...
...
pyinterpol_ng/pyinterpol_ng.cc
View file @
4b36a29f
...
...
@@ -242,16 +242,28 @@ PYBIND11_MODULE(pyinterpol_ng, m)
m
.
doc
()
=
pyinterpol_ng_DS
;
py
::
class_
<
PyInterpolator
<
fptype
>>
(
m
,
"PyInterpolator"
,
pyinterpolator_DS
)
using
inter_d
=
PyInterpolator
<
double
>
;
py
::
class_
<
inter_d
>
(
m
,
"PyInterpolator"
,
pyinterpolator_DS
)
.
def
(
py
::
init
<
const
py
::
array
&
,
const
py
::
array
&
,
bool
,
int64_t
,
int64_t
,
fptype
,
fptype
,
int
>
(),
initnormal_DS
,
"sky"
_a
,
"beam"
_a
,
"separate"
_a
,
"lmax"
_a
,
"kmax"
_a
,
"epsilon"
_a
,
"ofactor"
_a
=
fptype
(
1.5
),
"nthreads"
_a
=
0
)
.
def
(
py
::
init
<
int64_t
,
int64_t
,
int64_t
,
fptype
,
fptype
,
int
>
(),
initadjoint_DS
,
"lmax"
_a
,
"kmax"
_a
,
"ncomp"
_a
,
"epsilon"
_a
,
"ofactor"
_a
=
fptype
(
1.5
),
"nthreads"
_a
=
0
)
.
def
(
"interpol"
,
&
PyInterpolator
<
fptype
>::
pyinterpol
,
interpol_DS
,
"ptg"
_a
)
.
def
(
"deinterpol"
,
&
PyInterpolator
<
fptype
>::
pydeinterpol
,
deinterpol_DS
,
"ptg"
_a
,
"data"
_a
)
.
def
(
"getSlm"
,
&
PyInterpolator
<
fptype
>::
pygetSlm
,
getSlm_DS
,
"beam"
_a
)
.
def
(
"support"
,
&
PyInterpolator
<
fptype
>::
support
);
.
def
(
"interpol"
,
&
inter_d
::
pyinterpol
,
interpol_DS
,
"ptg"
_a
)
.
def
(
"deinterpol"
,
&
inter_d
::
pydeinterpol
,
deinterpol_DS
,
"ptg"
_a
,
"data"
_a
)
.
def
(
"getSlm"
,
&
inter_d
::
pygetSlm
,
getSlm_DS
,
"beam"
_a
)
.
def
(
"support"
,
&
inter_d
::
support
);
// using inter_f = PyInterpolator<float>;
// py::class_<inter_f> (m, "PyInterpolator_f", pyinterpolator_DS)
// .def(py::init<const py::array &, const py::array &, bool, int64_t, int64_t, fptype, fptype, int>(),
// initnormal_DS, "sky"_a, "beam"_a, "separate"_a, "lmax"_a, "kmax"_a, "epsilon"_a, "ofactor"_a=fptype(1.5),
// "nthreads"_a=0)
// .def(py::init<int64_t, int64_t, int64_t, fptype, fptype, int>(), initadjoint_DS,
// "lmax"_a, "kmax"_a, "ncomp"_a, "epsilon"_a, "ofactor"_a=fptype(1.5), "nthreads"_a=0)
// .def ("interpol", &inter_f::pyinterpol, interpol_DS, "ptg"_a)
// .def ("deinterpol", &inter_f::pydeinterpol, deinterpol_DS, "ptg"_a, "data"_a)
// .def ("getSlm", &inter_f::pygetSlm, getSlm_DS, "beam"_a)
// .def ("support", &inter_f::support);
#if 1
m
.
def
(
"rotate_alm"
,
&
pyrotate_alm
<
fptype
>
,
"alm"
_a
,
"lmax"
_a
,
"psi"
_a
,
"theta"
_a
,
"phi"
_a
);
...
...
src/mr_util/math/fft.h
View file @
4b36a29f
...
...
@@ -604,21 +604,6 @@ template<typename T, typename T0> aligned_array<T> alloc_tmp
auto
tmpsize
=
axsize
*
((
othersize
>=
vlen
)
?
vlen
:
1
);
return
aligned_array
<
T
>
(
tmpsize
);
}
// Allocates a scratch buffer large enough for transforms along any of `axes`.
//
// info : shape information of the array being processed
// axes : axes along which transforms will be carried out
//
// For every listed axis the required size is the axis length, multiplied by
// the SIMD width native_simd<T0>::size() when the array contains at least
// that many lines along the axis. The returned aligned_array<T> has the
// largest of these sizes; it has size 0 when `axes` is empty or all listed
// axes are empty.
template<typename T, typename T0> aligned_array<T> alloc_tmp
  (const fmav_info &info, const shape_t &axes)
  {
  constexpr auto vlen = native_simd<T0>::size();
  const size_t fullsize = info.size();
  size_t tmpsize = 0;
  for (size_t i=0; i<axes.size(); ++i)
    {
    const auto axsize = info.shape(axes[i]);
    // An empty axis needs no storage; skipping it also avoids the division
    // by zero that fullsize/axsize would otherwise perform.
    if (axsize==0) continue;
    const auto othersize = fullsize/axsize;
    const auto sz = axsize*((othersize>=vlen) ? vlen : 1);
    if (sz>tmpsize) tmpsize = sz;
    }
  return aligned_array<T>(tmpsize);
  }
template
<
typename
T
,
size_t
vlen
>
void
copy_input
(
const
multi_iter
<
vlen
>
&
it
,
const
fmav
<
Cmplx
<
T
>>
&
src
,
Cmplx
<
native_simd
<
T
>>
*
MRUTIL_RESTRICT
dst
)
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment