Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pypocketfft
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
4
Issues
4
List
Boards
Labels
Service Desk
Milestones
Merge Requests
2
Merge Requests
2
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Martin Reinecke
pypocketfft
Commits
bef3070e
Commit
bef3070e
authored
Aug 04, 2019
by
Martin Reinecke
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'general_cleanup' into 'cleanup2'
More cleanup of `general_` functions See merge request
!20
parents
8a1d38e4
8aa2d5cd
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
177 additions
and
227 deletions
+177
-227
pocketfft_hdronly.h
pocketfft_hdronly.h
+166
-225
test.py
test.py
+11
-2
No files found.
pocketfft_hdronly.h
View file @
bef3070e
...
...
@@ -2812,6 +2812,8 @@ template<> struct VTYPE<long double>
{
using
type
=
long
double
__attribute__
((
vector_size
(
VLEN
<
long
double
>::
val
*
sizeof
(
long
double
))));
};
template
<
typename
T
>
using
vtype_t
=
typename
VTYPE
<
T
>::
type
;
#endif
template
<
typename
T
>
arr
<
char
>
alloc_tmp
(
const
shape_t
&
shape
,
...
...
@@ -2842,187 +2844,177 @@ template<typename T> arr<char> alloc_tmp(const shape_t &shape,
#define POCKETFFT_NTHREADS
#endif
template
<
typename
T
>
POCKETFFT_NOINLINE
void
general_c
(
const
cndarr
<
cmplx
<
T
>>
&
in
,
ndarr
<
cmplx
<
T
>>
&
out
,
const
shape_t
&
axes
,
bool
forward
,
T
fct
,
size_t
POCKETFFT_NTHREADS
)
template
<
typename
T
,
size_t
vlen
>
void
copy_input
(
const
multi_iter
<
vlen
>
&
it
,
const
cndarr
<
cmplx
<
T
>>
&
src
,
cmplx
<
vtype_t
<
T
>>
*
POCKETFFT_RESTRICT
dst
)
{
shared_ptr
<
pocketfft_c
<
T
>>
plan
;
for
(
size_t
i
=
0
;
i
<
it
.
length_in
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
dst
[
i
].
r
[
j
]
=
src
[
it
.
iofs
(
j
,
i
)].
r
;
dst
[
i
].
i
[
j
]
=
src
[
it
.
iofs
(
j
,
i
)].
i
;
}
}
for
(
size_t
iax
=
0
;
iax
<
axes
.
size
();
++
iax
)
{
constexpr
auto
vlen
=
VLEN
<
T
>::
val
;
size_t
len
=
in
.
shape
(
axes
[
iax
]);
if
((
!
plan
)
||
(
len
!=
plan
->
length
()))
plan
=
get_plan
<
pocketfft_c
<
T
>>
(
len
);
template
<
typename
T
,
size_t
vlen
>
void
copy_input
(
const
multi_iter
<
vlen
>
&
it
,
const
cndarr
<
T
>
&
src
,
vtype_t
<
T
>
*
POCKETFFT_RESTRICT
dst
)
{
for
(
size_t
i
=
0
;
i
<
it
.
length_in
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
dst
[
i
][
j
]
=
src
[
it
.
iofs
(
j
,
i
)];
}
#ifdef POCKETFFT_OPENMP
#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax]))
#endif
{
auto
storage
=
alloc_tmp
<
T
>
(
in
.
shape
(),
len
,
sizeof
(
cmplx
<
T
>
));
const
auto
&
tin
(
iax
==
0
?
in
:
out
);
multi_iter
<
vlen
>
it
(
tin
,
out
,
axes
[
iax
]);
#ifndef POCKETFFT_NO_VECTORS
if
(
vlen
>
1
)
while
(
it
.
remaining
()
>=
vlen
)
{
using
vtype
=
typename
VTYPE
<
T
>::
type
;
it
.
advance
(
vlen
);
auto
tdatav
=
reinterpret_cast
<
cmplx
<
vtype
>
*>
(
storage
.
data
());
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
tdatav
[
i
].
r
[
j
]
=
tin
[
it
.
iofs
(
j
,
i
)].
r
;
tdatav
[
i
].
i
[
j
]
=
tin
[
it
.
iofs
(
j
,
i
)].
i
;
}
plan
->
exec
(
tdatav
,
fct
,
forward
);
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
out
[
it
.
oofs
(
j
,
i
)].
Set
(
tdatav
[
i
].
r
[
j
],
tdatav
[
i
].
i
[
j
]);
}
#endif
while
(
it
.
remaining
()
>
0
)
{
it
.
advance
(
1
);
auto
buf
=
it
.
stride_out
()
==
sizeof
(
cmplx
<
T
>
)
?
&
out
[
it
.
oofs
(
0
)]
:
reinterpret_cast
<
cmplx
<
T
>
*>
(
storage
.
data
());
template
<
typename
T
,
size_t
vlen
>
void
copy_input
(
const
multi_iter
<
vlen
>
&
it
,
const
cndarr
<
T
>
&
src
,
T
*
POCKETFFT_RESTRICT
dst
)
{
if
(
dst
==
&
src
[
it
.
iofs
(
0
)])
return
;
// in-place
for
(
size_t
i
=
0
;
i
<
it
.
length_in
();
++
i
)
dst
[
i
]
=
src
[
it
.
iofs
(
i
)];
}
if
(
buf
!=
&
tin
[
it
.
iofs
(
0
)])
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
buf
[
i
]
=
tin
[
it
.
iofs
(
i
)];
plan
->
exec
(
buf
,
fct
,
forward
);
if
(
buf
!=
&
out
[
it
.
oofs
(
0
)])
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
out
[
it
.
oofs
(
i
)]
=
buf
[
i
];
}
}
// end of parallel region
fct
=
T
(
1
);
// factor has been applied, use 1 for remaining axes
}
template
<
typename
T
,
size_t
vlen
>
void
copy_output
(
const
multi_iter
<
vlen
>
&
it
,
const
cmplx
<
vtype_t
<
T
>>
*
POCKETFFT_RESTRICT
src
,
ndarr
<
cmplx
<
T
>>
&
dst
)
{
for
(
size_t
i
=
0
;
i
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
dst
[
it
.
oofs
(
j
,
i
)].
Set
(
src
[
i
].
r
[
j
],
src
[
i
].
i
[
j
]);
}
template
<
typename
T
>
POCKETFFT_NOINLINE
void
general_hartley
(
const
cndarr
<
T
>
&
in
,
ndarr
<
T
>
&
out
,
const
shape_t
&
axes
,
T
fct
,
size_t
POCKETFFT_NTHREADS
)
template
<
typename
T
,
size_t
vlen
>
void
copy_output
(
const
multi_iter
<
vlen
>
&
it
,
const
vtype_t
<
T
>
*
POCKETFFT_RESTRICT
src
,
ndarr
<
T
>
&
dst
)
{
for
(
size_t
i
=
0
;
i
<
it
.
length_out
();
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
dst
[
it
.
oofs
(
j
,
i
)]
=
src
[
i
][
j
];
}
template
<
typename
T
,
size_t
vlen
>
void
copy_output
(
const
multi_iter
<
vlen
>
&
it
,
const
T
*
POCKETFFT_RESTRICT
src
,
ndarr
<
T
>
&
dst
)
{
if
(
src
==
&
dst
[
it
.
oofs
(
0
)])
return
;
// in-place
for
(
size_t
i
=
0
;
i
<
it
.
length_out
();
++
i
)
dst
[
it
.
oofs
(
i
)]
=
src
[
i
];
}
template
<
typename
T
>
struct
add_vec
{
using
type
=
vtype_t
<
T
>
;
};
template
<
typename
T
>
struct
add_vec
<
cmplx
<
T
>>
{
using
type
=
cmplx
<
vtype_t
<
T
>>
;
};
template
<
typename
T
>
using
add_vec_t
=
typename
add_vec
<
T
>::
type
;
template
<
typename
Tplan
,
typename
T
,
typename
T0
,
typename
Exec
>
POCKETFFT_NOINLINE
void
general_nd
(
const
cndarr
<
T
>
&
in
,
ndarr
<
T
>
&
out
,
const
shape_t
&
axes
,
T0
fct
,
size_t
POCKETFFT_NTHREADS
,
const
Exec
&
exec
,
const
bool
allow_inplace
=
true
)
{
shared_ptr
<
pocketfft_r
<
T
>
>
plan
;
shared_ptr
<
Tplan
>
plan
;
for
(
size_t
iax
=
0
;
iax
<
axes
.
size
();
++
iax
)
{
constexpr
auto
vlen
=
VLEN
<
T
>::
val
;
constexpr
auto
vlen
=
VLEN
<
T
0
>::
val
;
size_t
len
=
in
.
shape
(
axes
[
iax
]);
if
((
!
plan
)
||
(
len
!=
plan
->
length
()))
plan
=
get_plan
<
pocketfft_r
<
T
>
>
(
len
);
plan
=
get_plan
<
Tplan
>
(
len
);
#ifdef POCKETFFT_OPENMP
#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax]))
#endif
{
auto
storage
=
alloc_tmp
<
T
>
(
in
.
shape
(),
len
,
sizeof
(
T
));
const
auto
&
tin
(
iax
==
0
?
in
:
out
);
auto
storage
=
alloc_tmp
<
T
0
>
(
in
.
shape
(),
len
,
sizeof
(
T
));
const
auto
&
tin
(
iax
==
0
?
in
:
out
);
multi_iter
<
vlen
>
it
(
tin
,
out
,
axes
[
iax
]);
#ifndef POCKETFFT_NO_VECTORS
if
(
vlen
>
1
)
while
(
it
.
remaining
()
>=
vlen
)
{
using
vtype
=
typename
VTYPE
<
T
>::
type
;
it
.
advance
(
vlen
);
auto
tdatav
=
reinterpret_cast
<
vtype
*>
(
storage
.
data
());
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
tdatav
[
i
][
j
]
=
tin
[
it
.
iofs
(
j
,
i
)];
plan
->
exec
(
tdatav
,
fct
,
true
);
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
out
[
it
.
oofs
(
j
,
0
)]
=
tdatav
[
0
][
j
];
size_t
i
=
1
,
i1
=
1
,
i2
=
len
-
1
;
for
(
i
=
1
;
i
<
len
-
1
;
i
+=
2
,
++
i1
,
--
i2
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
out
[
it
.
oofs
(
j
,
i1
)]
=
tdatav
[
i
][
j
]
+
tdatav
[
i
+
1
][
j
];
out
[
it
.
oofs
(
j
,
i2
)]
=
tdatav
[
i
][
j
]
-
tdatav
[
i
+
1
][
j
];
}
if
(
i
<
len
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
out
[
it
.
oofs
(
j
,
i1
)]
=
tdatav
[
i
][
j
];
auto
tdatav
=
reinterpret_cast
<
add_vec_t
<
T
>
*>
(
storage
.
data
());
exec
(
it
,
tin
,
out
,
tdatav
,
*
plan
,
fct
);
}
#endif
while
(
it
.
remaining
()
>
0
)
{
it
.
advance
(
1
);
auto
tdata
=
reinterpret_cast
<
T
*>
(
storage
.
data
());
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
tdata
[
i
]
=
tin
[
it
.
iofs
(
i
)];
plan
->
exec
(
tdata
,
fct
,
true
);
// Hartley order
out
[
it
.
oofs
(
0
)]
=
tdata
[
0
];
size_t
i
=
1
,
i1
=
1
,
i2
=
len
-
1
;
for
(
i
=
1
;
i
<
len
-
1
;
i
+=
2
,
++
i1
,
--
i2
)
{
out
[
it
.
oofs
(
i1
)]
=
tdata
[
i
]
+
tdata
[
i
+
1
];
out
[
it
.
oofs
(
i2
)]
=
tdata
[
i
]
-
tdata
[
i
+
1
];
}
if
(
i
<
len
)
out
[
it
.
oofs
(
i1
)]
=
tdata
[
i
];
auto
buf
=
allow_inplace
&&
it
.
stride_out
()
==
sizeof
(
T
)
?
&
out
[
it
.
oofs
(
0
)]
:
reinterpret_cast
<
T
*>
(
storage
.
data
());
exec
(
it
,
tin
,
out
,
buf
,
*
plan
,
fct
);
}
}
// end of parallel region
fct
=
T
(
1
);
// factor has been applied, use 1 for remaining axes
fct
=
T
0
(
1
);
// factor has been applied, use 1 for remaining axes
}
}
template
<
typename
Trafo
,
typename
T
>
POCKETFFT_NOINLINE
void
general_dcst
(
const
cndarr
<
T
>
&
in
,
ndarr
<
T
>
&
out
,
const
shape_t
&
axes
,
T
fct
,
bool
ortho
,
int
type
,
bool
cosine
,
size_t
POCKETFFT_NTHREADS
)
struct
ExecC2C
{
shared_ptr
<
Trafo
>
plan
;
bool
forward
;
for
(
size_t
iax
=
0
;
iax
<
axes
.
size
();
++
iax
)
template
<
typename
T0
,
typename
T
,
size_t
vlen
>
void
operator
()
(
const
multi_iter
<
vlen
>
&
it
,
const
cndarr
<
cmplx
<
T0
>>
&
in
,
ndarr
<
cmplx
<
T0
>>
&
out
,
T
*
buf
,
const
pocketfft_c
<
T0
>
&
plan
,
T0
fct
)
const
{
constexpr
auto
vlen
=
VLEN
<
T
>::
val
;
size_t
len
=
in
.
shape
(
axes
[
iax
]);
if
((
!
plan
)
||
(
len
!=
plan
->
length
()))
plan
=
get_plan
<
Trafo
>
(
len
);
copy_input
(
it
,
in
,
buf
);
plan
.
exec
(
buf
,
fct
,
forward
);
copy_output
(
it
,
buf
,
out
);
}
};
#ifdef POCKETFFT_OPENMP
#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax]))
#endif
{
auto
storage
=
alloc_tmp
<
T
>
(
in
.
shape
(),
len
,
sizeof
(
T
));
const
auto
&
tin
(
iax
==
0
?
in
:
out
);
multi_iter
<
vlen
>
it
(
tin
,
out
,
axes
[
iax
]);
#ifndef POCKETFFT_NO_VECTORS
if
(
vlen
>
1
)
while
(
it
.
remaining
()
>=
vlen
)
{
using
vtype
=
typename
VTYPE
<
T
>::
type
;
it
.
advance
(
vlen
);
auto
tdatav
=
reinterpret_cast
<
vtype
*>
(
storage
.
data
());
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
tdatav
[
i
][
j
]
=
tin
[
it
.
iofs
(
j
,
i
)];
plan
->
exec
(
tdatav
,
fct
,
ortho
,
type
,
cosine
);
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
out
[
it
.
oofs
(
j
,
i
)]
=
tdatav
[
i
][
j
];
}
#endif
while
(
it
.
remaining
()
>
0
)
{
it
.
advance
(
1
);
auto
buf
=
it
.
stride_out
()
==
sizeof
(
T
)
?
&
out
[
it
.
oofs
(
0
)]
:
reinterpret_cast
<
T
*>
(
storage
.
data
());
template
<
typename
T
,
size_t
vlen
>
void
copy_hartley
(
const
multi_iter
<
vlen
>
&
it
,
const
vtype_t
<
T
>
*
POCKETFFT_RESTRICT
src
,
ndarr
<
T
>
&
dst
)
{
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
dst
[
it
.
oofs
(
j
,
0
)]
=
src
[
0
][
j
];
size_t
i
=
1
,
i1
=
1
,
i2
=
it
.
length_out
()
-
1
;
for
(
i
=
1
;
i
<
it
.
length_out
()
-
1
;
i
+=
2
,
++
i1
,
--
i2
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
{
dst
[
it
.
oofs
(
j
,
i1
)]
=
src
[
i
][
j
]
+
src
[
i
+
1
][
j
];
dst
[
it
.
oofs
(
j
,
i2
)]
=
src
[
i
][
j
]
-
src
[
i
+
1
][
j
];
}
if
(
i
<
it
.
length_out
())
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
dst
[
it
.
oofs
(
j
,
i1
)]
=
src
[
i
][
j
];
}
if
(
buf
!=
&
tin
[
it
.
iofs
(
0
)])
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
buf
[
i
]
=
tin
[
it
.
iofs
(
i
)];
plan
->
exec
(
buf
,
fct
,
ortho
,
type
,
cosine
);
if
(
buf
!=
&
out
[
it
.
oofs
(
0
)])
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
out
[
it
.
oofs
(
i
)]
=
buf
[
i
];
}
}
// end of parallel region
fct
=
T
(
1
);
// factor has been applied, use 1 for remaining axes
template
<
typename
T
,
size_t
vlen
>
void
copy_hartley
(
const
multi_iter
<
vlen
>
&
it
,
const
T
*
POCKETFFT_RESTRICT
src
,
ndarr
<
T
>
&
dst
)
{
dst
[
it
.
oofs
(
0
)]
=
src
[
0
];
size_t
i
=
1
,
i1
=
1
,
i2
=
it
.
length_out
()
-
1
;
for
(
i
=
1
;
i
<
it
.
length_out
()
-
1
;
i
+=
2
,
++
i1
,
--
i2
)
{
dst
[
it
.
oofs
(
i1
)]
=
src
[
i
]
+
src
[
i
+
1
];
dst
[
it
.
oofs
(
i2
)]
=
src
[
i
]
-
src
[
i
+
1
];
}
if
(
i
<
it
.
length_out
())
dst
[
it
.
oofs
(
i1
)]
=
src
[
i
];
}
struct
ExecHartley
{
template
<
typename
T0
,
typename
T
,
size_t
vlen
>
void
operator
()
(
const
multi_iter
<
vlen
>
&
it
,
const
cndarr
<
T0
>
&
in
,
ndarr
<
T0
>
&
out
,
T
*
buf
,
const
pocketfft_r
<
T0
>
&
plan
,
T0
fct
)
const
{
copy_input
(
it
,
in
,
buf
);
plan
.
exec
(
buf
,
fct
,
true
);
copy_hartley
(
it
,
buf
,
out
);
}
};
struct
ExecDcst
{
bool
ortho
;
int
type
;
bool
cosine
;
template
<
typename
T0
,
typename
T
,
typename
Tplan
,
size_t
vlen
>
void
operator
()
(
const
multi_iter
<
vlen
>
&
it
,
const
cndarr
<
T0
>
&
in
,
ndarr
<
T0
>
&
out
,
T
*
buf
,
const
Tplan
&
plan
,
T0
fct
)
const
{
copy_input
(
it
,
in
,
buf
);
plan
.
exec
(
buf
,
fct
,
ortho
,
type
,
cosine
);
copy_output
(
it
,
buf
,
out
);
}
};
template
<
typename
T
>
POCKETFFT_NOINLINE
void
general_r2c
(
const
cndarr
<
T
>
&
in
,
ndarr
<
cmplx
<
T
>>
&
out
,
size_t
axis
,
bool
forward
,
T
fct
,
size_t
POCKETFFT_NTHREADS
)
...
...
@@ -3040,12 +3032,9 @@ template<typename T> POCKETFFT_NOINLINE void general_r2c(
if
(
vlen
>
1
)
while
(
it
.
remaining
()
>=
vlen
)
{
using
vtype
=
typename
VTYPE
<
T
>::
type
;
it
.
advance
(
vlen
);
auto
tdatav
=
reinterpret_cast
<
vtype
*>
(
storage
.
data
());
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
tdatav
[
i
][
j
]
=
in
[
it
.
iofs
(
j
,
i
)];
auto
tdatav
=
reinterpret_cast
<
vtype_t
<
T
>
*>
(
storage
.
data
());
copy_input
(
it
,
in
,
tdatav
);
plan
->
exec
(
tdatav
,
fct
,
true
);
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
out
[
it
.
oofs
(
j
,
0
)].
Set
(
tdatav
[
0
][
j
]);
...
...
@@ -3067,8 +3056,7 @@ template<typename T> POCKETFFT_NOINLINE void general_r2c(
{
it
.
advance
(
1
);
auto
tdata
=
reinterpret_cast
<
T
*>
(
storage
.
data
());
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
tdata
[
i
]
=
in
[
it
.
iofs
(
i
)];
copy_input
(
it
,
in
,
tdata
);
plan
->
exec
(
tdata
,
fct
,
true
);
out
[
it
.
oofs
(
0
)].
Set
(
tdata
[
0
]);
size_t
i
=
1
,
ii
=
1
;
...
...
@@ -3100,9 +3088,8 @@ template<typename T> POCKETFFT_NOINLINE void general_c2r(
if
(
vlen
>
1
)
while
(
it
.
remaining
()
>=
vlen
)
{
using
vtype
=
typename
VTYPE
<
T
>::
type
;
it
.
advance
(
vlen
);
auto
tdatav
=
reinterpret_cast
<
vtype
*>
(
storage
.
data
());
auto
tdatav
=
reinterpret_cast
<
vtype
_t
<
T
>
*>
(
storage
.
data
());
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
tdatav
[
0
][
j
]
=
in
[
it
.
iofs
(
j
,
0
)].
r
;
{
...
...
@@ -3120,9 +3107,7 @@ template<typename T> POCKETFFT_NOINLINE void general_c2r(
tdatav
[
i
][
j
]
=
in
[
it
.
iofs
(
j
,
ii
)].
r
;
}
plan
->
exec
(
tdatav
,
fct
,
false
);
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
out
[
it
.
oofs
(
j
,
i
)]
=
tdatav
[
i
][
j
];
copy_output
(
it
,
tdatav
,
out
);
}
#endif
while
(
it
.
remaining
()
>
0
)
...
...
@@ -3142,78 +3127,30 @@ template<typename T> POCKETFFT_NOINLINE void general_c2r(
tdata
[
i
]
=
in
[
it
.
iofs
(
ii
)].
r
;
}
plan
->
exec
(
tdata
,
fct
,
false
);
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
out
[
it
.
oofs
(
i
)]
=
tdata
[
i
];
copy_output
(
it
,
tdata
,
out
);
}
}
// end of parallel region
}
template
<
typename
T
>
POCKETFFT_NOINLINE
void
general_r
(
const
cndarr
<
T
>
&
in
,
ndarr
<
T
>
&
out
,
const
shape_t
&
axes
,
bool
r2c
,
bool
forward
,
T
fct
,
size_t
POCKETFFT_NTHREADS
)
struct
ExecR2R
{
shared_ptr
<
pocketfft_r
<
T
>>
plan
;
bool
r2c
,
forward
;
for
(
size_t
iax
=
0
;
iax
<
axes
.
size
();
++
iax
)
template
<
typename
T0
,
typename
T
,
size_t
vlen
>
void
operator
()
(
const
multi_iter
<
vlen
>
&
it
,
const
cndarr
<
T0
>
&
in
,
ndarr
<
T0
>
&
out
,
T
*
buf
,
const
pocketfft_r
<
T0
>
&
plan
,
T0
fct
)
const
{
constexpr
auto
vlen
=
VLEN
<
T
>::
val
;
size_t
len
=
in
.
shape
(
axes
[
iax
]);
if
((
!
plan
)
||
(
len
!=
plan
->
length
()))
plan
=
get_plan
<
pocketfft_r
<
T
>>
(
len
);
#ifdef POCKETFFT_OPENMP
#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax]))
#endif
{
auto
storage
=
alloc_tmp
<
T
>
(
in
.
shape
(),
len
,
sizeof
(
T
));
const
auto
&
tin
(
iax
==
0
?
in
:
out
);
multi_iter
<
vlen
>
it
(
tin
,
out
,
axes
[
iax
]);
#ifndef POCKETFFT_NO_VECTORS
if
(
vlen
>
1
)
while
(
it
.
remaining
()
>=
vlen
)
{
using
vtype
=
typename
VTYPE
<
T
>::
type
;
it
.
advance
(
vlen
);
auto
tdatav
=
reinterpret_cast
<
vtype
*>
(
storage
.
data
());
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
tdatav
[
i
][
j
]
=
tin
[
it
.
iofs
(
j
,
i
)];
if
((
!
r2c
)
&&
forward
)
for
(
size_t
i
=
2
;
i
<
len
;
i
+=
2
)
tdatav
[
i
]
=
-
tdatav
[
i
];
plan
->
exec
(
tdatav
,
fct
,
forward
);
if
(
r2c
&&
(
!
forward
))
for
(
size_t
i
=
2
;
i
<
len
;
i
+=
2
)
tdatav
[
i
]
=
-
tdatav
[
i
];
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
for
(
size_t
j
=
0
;
j
<
vlen
;
++
j
)
out
[
it
.
oofs
(
j
,
i
)]
=
tdatav
[
i
][
j
];
}
#endif
while
(
it
.
remaining
()
>
0
)
{
it
.
advance
(
1
);
auto
buf
=
it
.
stride_out
()
==
sizeof
(
T
)
?
&
out
[
it
.
oofs
(
0
)]
:
reinterpret_cast
<
T
*>
(
storage
.
data
());
if
(
buf
!=
&
tin
[
it
.
iofs
(
0
)])
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
buf
[
i
]
=
tin
[
it
.
iofs
(
i
)];
if
((
!
r2c
)
&&
forward
)
for
(
size_t
i
=
2
;
i
<
len
;
i
+=
2
)
buf
[
i
]
=
-
buf
[
i
];
plan
->
exec
(
buf
,
fct
,
forward
);
if
(
r2c
&&
(
!
forward
))
for
(
size_t
i
=
2
;
i
<
len
;
i
+=
2
)
buf
[
i
]
=
-
buf
[
i
];
if
(
buf
!=
&
out
[
it
.
oofs
(
0
)])
for
(
size_t
i
=
0
;
i
<
len
;
++
i
)
out
[
it
.
oofs
(
i
)]
=
buf
[
i
];
}
}
// end of parallel region
fct
=
T
(
1
);
// factor has been applied, use 1 for remaining axes
copy_input
(
it
,
in
,
buf
);
if
((
!
r2c
)
&&
forward
)
for
(
size_t
i
=
2
;
i
<
it
.
length_out
();
i
+=
2
)
buf
[
i
]
=
-
buf
[
i
];
plan
.
exec
(
buf
,
fct
,
forward
);
if
(
r2c
&&
(
!
forward
))
for
(
size_t
i
=
2
;
i
<
it
.
length_out
();
i
+=
2
)
buf
[
i
]
=
-
buf
[
i
];
copy_output
(
it
,
buf
,
out
);
}
}
}
;
#undef POCKETFFT_NTHREADS
...
...
@@ -3226,7 +3163,7 @@ template<typename T> void c2c(const shape_t &shape, const stride_t &stride_in,
util
::
sanity_check
(
shape
,
stride_in
,
stride_out
,
data_in
==
data_out
,
axes
);
cndarr
<
cmplx
<
T
>>
ain
(
data_in
,
shape
,
stride_in
);
ndarr
<
cmplx
<
T
>>
aout
(
data_out
,
shape
,
stride_out
);
general_
c
(
ain
,
aout
,
axes
,
forward
,
fct
,
nthreads
);
general_
nd
<
pocketfft_c
<
T
>>
(
ain
,
aout
,
axes
,
fct
,
nthreads
,
ExecC2C
{
forward
}
);
}
template
<
typename
T
>
void
dct
(
const
shape_t
&
shape
,
...
...
@@ -3238,12 +3175,13 @@ template<typename T> void dct(const shape_t &shape,
util
::
sanity_check
(
shape
,
stride_in
,
stride_out
,
data_in
==
data_out
,
axes
);
cndarr
<
T
>
ain
(
data_in
,
shape
,
stride_in
);
ndarr
<
T
>
aout
(
data_out
,
shape
,
stride_out
);
const
ExecDcst
exec
{
ortho
,
type
,
true
};
if
(
type
==
1
)
general_
dcst
<
T_dct1
<
T
>>
(
ain
,
aout
,
axes
,
fct
,
ortho
,
type
,
true
,
nthreads
);
general_
nd
<
T_dct1
<
T
>>
(
ain
,
aout
,
axes
,
fct
,
nthreads
,
exec
);
else
if
(
type
==
4
)
general_
dcst
<
T_dcst4
<
T
>>
(
ain
,
aout
,
axes
,
fct
,
ortho
,
type
,
true
,
nthreads
);
general_
nd
<
T_dcst4
<
T
>>
(
ain
,
aout
,
axes
,
fct
,
nthreads
,
exec
);
else
general_
dcst
<
T_dcst23
<
T
>>
(
ain
,
aout
,
axes
,
fct
,
ortho
,
type
,
true
,
nthreads
);
general_
nd
<
T_dcst23
<
T
>>
(
ain
,
aout
,
axes
,
fct
,
nthreads
,
exec
);
}
template
<
typename
T
>
void
dst
(
const
shape_t
&
shape
,
...
...
@@ -3255,12 +3193,13 @@ template<typename T> void dst(const shape_t &shape,
util
::
sanity_check
(
shape
,
stride_in
,
stride_out
,
data_in
==
data_out
,
axes
);
cndarr
<
T
>
ain
(
data_in
,
shape
,
stride_in
);
ndarr
<
T
>
aout
(
data_out
,
shape
,
stride_out
);
const
ExecDcst
exec
{
ortho
,
type
,
false
};
if
(
type
==
1
)
general_
dcst
<
T_dst1
<
T
>>
(
ain
,
aout
,
axes
,
fct
,
ortho
,
type
,
false
,
nthreads
);
general_
nd
<
T_dst1
<
T
>>
(
ain
,
aout
,
axes
,
fct
,
nthreads
,
exec
);
else
if
(
type
==
4
)
general_
dcst
<
T_dcst4
<
T
>>
(
ain
,
aout
,
axes
,
fct
,
ortho
,
type
,
false
,
nthreads
);
general_
nd
<
T_dcst4
<
T
>>
(
ain
,
aout
,
axes
,
fct
,
nthreads
,
exec
);
else
general_
dcst
<
T_dcst23
<
T
>>
(
ain
,
aout
,
axes
,
fct
,
ortho
,
type
,
false
,
nthreads
);
general_
nd
<
T_dcst23
<
T
>>
(
ain
,
aout
,
axes
,
fct
,
nthreads
,
exec
);
}
template
<
typename
T
>
void
r2c
(
const
shape_t
&
shape_in
,
...
...
@@ -3344,7 +3283,8 @@ template<typename T> void r2r_fftpack(const shape_t &shape,
util
::
sanity_check
(
shape
,
stride_in
,
stride_out
,
data_in
==
data_out
,
axes
);
cndarr
<
T
>
ain
(
data_in
,
shape
,
stride_in
);
ndarr
<
T
>
aout
(
data_out
,
shape
,
stride_out
);
general_r
(
ain
,
aout
,
axes
,
real2hermitian
,
forward
,
fct
,
nthreads
);
general_nd
<
pocketfft_r
<
T
>>
(
ain
,
aout
,
axes
,
fct
,
nthreads
,
ExecR2R
{
real2hermitian
,
forward
});
}
template
<
typename
T
>
void
r2r_separable_hartley
(
const
shape_t
&
shape
,
...
...
@@ -3355,7 +3295,8 @@ template<typename T> void r2r_separable_hartley(const shape_t &shape,
util
::
sanity_check
(
shape
,
stride_in
,
stride_out
,
data_in
==
data_out
,
axes
);
cndarr
<
T
>
ain
(
data_in
,
shape
,
stride_in
);
ndarr
<
T
>
aout
(
data_out
,
shape
,
stride_out
);
general_hartley
(
ain
,
aout
,
axes
,
fct
,
nthreads
);
general_nd
<
pocketfft_r
<
T
>>
(
ain
,
aout
,
axes
,
fct
,
nthreads
,
ExecHartley
{},
false
);
}
}
// namespace detail
...
...
test.py
View file @
bef3070e
...
...
@@ -58,9 +58,18 @@ def irfft_scipy(a, axis, inorm=0, out=None, nthreads=1):
tol
=
{
np
.
float32
:
6e-7
,
np
.
float64
:
1.5e-15
,
np
.
longfloat
:
1e-18
}
ctype
=
{
np
.
float32
:
np
.
complex64
,
np
.
float64
:
np
.
complex128
,
np
.
longfloat
:
np
.
longcomplex
}
import
platform
on_wsl
=
"microsoft"
in
platform
.
uname
()[
3
].
lower
()
true_long_double
=
(
np
.
longfloat
!=
np
.
float64
and
not
on_wsl
)
dtypes
=
[
np
.
float32
,
np
.
float64
,
pytest
.
param
(
np
.
longfloat
,
marks
=
pytest
.
mark
.
xfail
(
not
true_long_double
,
reason
=
"Long double doesn't offer more precision"
))]
@
pmp
(
"len"
,
len1D
)
@
pmp
(
"inorm"
,
[
0
,
1
,
2
])
@
pmp
(
"dtype"
,
[
np
.
float32
,
np
.
float64
,
np
.
longfloat
]
)
@
pmp
(
"dtype"
,
dtypes
)
def
test1D
(
len
,
inorm
,
dtype
):
a
=
np
.
random
.
rand
(
len
)
-
0.5
+
1j
*
np
.
random
.
rand
(
len
)
-
0.5j
a
=
a
.
astype
(
ctype
[
dtype
])
...
...
@@ -199,7 +208,7 @@ def test_genuine_hartley_2D(shp, axes):
@
pmp
(
"len"
,
len1D
)
@
pmp
(
"inorm"
,
[
0
,
1
])
# inorm==2 not needed, tested via inverse
@
pmp
(
"type"
,
[
1
,
2
,
3
,
4
])
@
pmp
(
"dtype"
,
[
np
.
float32
,
np
.
float64
,
np
.
longfloat
]
)
@
pmp
(
"dtype"
,
dtypes
)
def
testdcst1D
(
len
,
inorm
,
type
,
dtype
):
a
=
(
np
.
random
.
rand
(
len
)
-
0.5
).
astype
(
dtype
)
eps
=
tol
[
dtype
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment