Simon Perkins / ducc / Commits / 513c1b5b

Commit 513c1b5b authored Jan 07, 2020 by Martin Reinecke
introduce fallback SIMD functionality
parent 6d68d971

Changes 3
libsharp2/sharp_core_inc.cc (view file @ 513c1b5b)

@@ -37,26 +37,46 @@
#include <complex>
#include <math.h>
#include <string.h>
#include <experimental/simd>
#include "libsharp2/sharp.h"
#include "libsharp2/sharp_internal.h"
#include "libsharp2/sharp_utils.h"
#include "mr_util/error_handling.h"
#include "mr_util/useful_macros.h"
#include "mr_util/simd.h"

#pragma GCC visibility push(hidden)

using namespace std::experimental;

using Tv = std::experimental::native_simd<double>;
static constexpr size_t VLEN = Tv::size();

#if (defined(__AVX__) && (!defined(__AVX512F__)))
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
  complex<double> * MRUTIL_RESTRICT cc)
  {
  auto tmp1 = _mm256_hadd_pd(__data(a), __data(b)),
       tmp2 = _mm256_hadd_pd(__data(c), __data(d));
  auto tmp3 = _mm256_permute2f128_pd(tmp1, tmp2, 49),
       tmp4 = _mm256_permute2f128_pd(tmp1, tmp2, 32);
  tmp1 = tmp3 + tmp4;
  union U
    {
    decltype(tmp1) v;
    complex<double> c[2];
    U() {}
    };
  U u;
  u.v = tmp1;
  cc[0] += u.c[0];
  cc[1] += u.c[1];
  }
#else
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
  complex<double> * MRUTIL_RESTRICT cc)
  {
  using std::experimental::reduce;
  cc[0] += complex<double>(reduce(a, std::plus<>()), reduce(b, std::plus<>()));
  cc[1] += complex<double>(reduce(c, std::plus<>()), reduce(d, std::plus<>()));
  }
#endif
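// Scalar reference sketch (hypothetical helper, not part of the commit):
// both vhsum_cmplx_special branches above are intended to be equivalent to
// this loop, which horizontally sums each of the four input vectors and
// accumulates the four sums as two complex numbers.
static inline void vhsum_cmplx_reference (Tv a, Tv b, Tv c, Tv d,
  complex<double> * MRUTIL_RESTRICT cc)
  {
  double sa=0., sb=0., sc=0., sd=0.;
  for (size_t i=0; i<VLEN; ++i)
    { sa+=a[i]; sb+=b[i]; sc+=c[i]; sd+=d[i]; }
  cc[0] += complex<double>(sa, sb);
  cc[1] += complex<double>(sc, sd);
  }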
// In the following, we explicitly allow the compiler to contract floating
// point operations, like multiply-and-add.
// Unfortunately, most compilers don't act on this pragma yet.
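// (Illustration only, not the elided line that follows in the file: the
//  standard mechanism for allowing such contraction is
//      #pragma STDC FP_CONTRACT ON
//  which permits an expression like a*b + c to be evaluated as a single
//  fused multiply-add.)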
...
...
mr_util/fft.h (view file @ 513c1b5b)

@@ -50,15 +50,16 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <memory>
#include <vector>
#include <complex>
#include <algorithm>
#if POCKETFFT_CACHE_SIZE!=0
#include <array>
#endif
#include "mr_util/threading.h"
#include <experimental/simd>
#include "mr_util/cmplx.h"
#include "mr_util/aligned_array.h"
#include "mr_util/unity_roots.h"
#include "mr_util/useful_macros.h"
#include "mr_util/simd.h"

namespace mr {

...

@@ -258,7 +259,7 @@ struct util // hack to avoid duplicate symbols
    parallel /= 4;
    size_t max_threads = nthreads==0 ? thread::hardware_concurrency() : nthreads;
    return max(size_t(1), min(parallel, max_threads));
    return std::max(size_t(1), std::min(parallel, max_threads));
    }
#endif
  };

...

@@ -2498,7 +2499,11 @@ template<> struct VTYPE<double>
  };
template<> struct VTYPE<long double>
  {
#ifdef MRUTIL_HOMEGROWN_SIMD
  using type = detail_simd::vtp<long double, 1>;
#else
  using type = simd<long double, simd_abi::fixed_size<1>>;
#endif
  };
#endif
...
...
mr_util/simd.h (new file, 0 → 100644; view file @ 513c1b5b)
/*
* This file is part of the MR utility library.
*
* This code is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This code is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this code; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* Copyright (C) 2019 Max-Planck-Society
Author: Martin Reinecke */
#ifndef MRUTIL_SIMD_H
#define MRUTIL_SIMD_H

#define MRUTIL_HOMEGROWN_SIMD

#ifndef MRUTIL_HOMEGROWN_SIMD

#include <experimental/simd>

#else

// only enable SIMD support for gcc>=5.0 and clang>=5.0
#ifndef MRUTIL_NO_SIMD
#define MRUTIL_NO_SIMD
#if defined(__INTEL_COMPILER)
// do nothing. This is necessary because this compiler also sets __GNUC__.
#elif defined(__clang__)
#if __clang_major__>=5
#undef MRUTIL_NO_SIMD
#endif
#elif defined(__GNUC__)
#if __GNUC__>=5
#undef MRUTIL_NO_SIMD
#endif
#endif
#endif

#ifndef MRUTIL_NO_SIMD

#include <cstdlib>
#include <cmath>
#include <x86intrin.h>

namespace std {
namespace experimental {
namespace detail_simd {

#if (defined(__AVX512F__))
constexpr size_t vbytes = 64;
#elif (defined(__AVX__))
constexpr size_t vbytes = 32;
#elif (defined(__SSE2__))
constexpr size_t vbytes = 16;
#elif (defined(__VSX__))
constexpr size_t vbytes = 16;
#endif

template<size_t ibytes> struct Itype {};
template<> struct Itype<4> { using type = int32_t; };
template<> struct Itype<8> { using type = int64_t; };

template<typename T, size_t len=vbytes/sizeof(T)> class vtp
  {
  public:
    using Tv [[gnu::vector_size(len*sizeof(T))]] = T;
    static_assert((len>0) && ((len&(len-1))==0), "bad vector length");
    Tv v;

    inline void from_scalar(const T &other)
      { v = v*0 + other; }

  public:
    using mask_type = vtp<typename Itype<sizeof(T)>::type, len>;
    static constexpr size_t size() { return len; }
    vtp() = default;
    vtp(T other) { from_scalar(other); }
    vtp(const Tv &other) : v(other) {}
    vtp(const vtp &other) = default;
    vtp &operator=(const T &other) { from_scalar(other); return *this; }
    vtp operator-() const { return vtp(-v); }
    vtp operator+(vtp other) const { return vtp(v+other.v); }
    vtp operator-(vtp other) const { return vtp(v-other.v); }
    vtp operator*(vtp other) const { return vtp(v*other.v); }
    vtp operator/(vtp other) const { return vtp(v/other.v); }
    vtp operator&(vtp other) const { return vtp(v&other.v); }
    vtp &operator+=(vtp other) { v+=other.v; return *this; }
    vtp &operator-=(vtp other) { v-=other.v; return *this; }
    vtp &operator*=(vtp other) { v*=other.v; return *this; }
    vtp &operator/=(vtp other) { v/=other.v; return *this; }
    inline vtp exp() const
      {
      vtp res;
      for (size_t i=0; i<len; ++i) res.v[i] = std::exp(v[i]);
      return res;
      }
    inline vtp sqrt() const
      {
      vtp res;
      for (size_t i=0; i<len; ++i) res.v[i] = std::sqrt(v[i]);
      return res;
      }
    vtp max(const vtp &other) const
      { return vtp(v>other.v ? v : other.v); }
    mask_type operator>(const vtp &other) const { return v>other.v; }
    mask_type operator>=(const vtp &other) const { return v>=other.v; }
    mask_type operator<(const vtp &other) const { return v<other.v; }
    mask_type operator!=(const vtp &other) const { return v!=other.v; }
    class reference
      {
      private:
        vtp &v;
        size_t i;
      public:
        reference(vtp &v_, size_t i_) : v(v_), i(i_) {}
        reference &operator=(T other) { v.v[i] = other; return *this; }
        reference &operator*=(T other) { v.v[i] *= other; return *this; }
        operator T() const { return v.v[i]; }
      };
    class where_expr
      {
      private:
        vtp &v;
        mask_type m;
      public:
        where_expr(mask_type m_, vtp &v_) : v(v_), m(m_) {}
        where_expr &operator*=(const vtp &other)
          { v = vtp(m.v ? v.v*other.v : v.v); return *this; }
        where_expr &operator+=(const vtp &other)
          { v = vtp(m.v ? v.v+other.v : v.v); return *this; }
        where_expr &operator-=(const vtp &other)
          { v = vtp(m.v ? v.v-other.v : v.v); return *this; }
      };
    reference operator[](size_t i) { return reference(*this, i); }
    T operator[](size_t i) const { return v[i]; }
    Tv __data() const { return v; }
  };
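// Descriptive note (added commentary, not in the committed file): operator[]
// on a mutable vtp returns the "reference" proxy above, so element writes
// (x[i] = 1.) and in-place scaling (x[i] *= 2.) go through
// reference::operator= / operator*=.  Masked updates mirror the
// std::experimental::simd interface:  where(x > y, x) += z;  modifies only
// the lanes whose mask element is non-zero, via where_expr::operator+=.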
template<typename T, size_t len> inline typename vtp<T, len>::Tv __data(const vtp<T, len> &v)
  { return v.__data(); }

template<typename T, size_t len> inline vtp<T, len> abs(vtp<T, len> v)
  {
  vtp<T, len> res;
  for (size_t i=0; i<len; ++i) res.v[i] = std::abs(v.v[i]);
  return res;
  }
template<typename T, size_t len> typename vtp<T, len>::where_expr where(typename vtp<T, len>::mask_type m, vtp<T, len> &v)
  { return typename vtp<T, len>::where_expr(m, v); }
template<typename T0, typename T, size_t len> vtp<T, len> operator*(T0 a, vtp<T, len> b)
  { return b*a; }
template<typename T, size_t len> vtp<T, len> operator+(T a, vtp<T, len> b)
  { return b+a; }
template<typename T, size_t len> vtp<T, len> operator-(T a, vtp<T, len> b)
  { return vtp<T, len>(a)-b; }
template<typename T, size_t len> vtp<T, len> max(vtp<T, len> a, vtp<T, len> b)
  { return a.max(b); }
template<typename T, size_t len> vtp<T, len> sqrt(vtp<T, len> v)
  { return v.sqrt(); }

template<typename T> using native_simd = vtp<T>;

template<typename Op, typename T, size_t len> T reduce(const vtp<T, len> &v, Op op)
  {
  T res = v[0];
  for (size_t i=1; i<len; ++i)
    res = op(res, v[i]);
  return res;
  }
template<typename T, size_t len> inline bool any_of(const vtp<T, len> &mask)
  {
  bool res = false;
  for (size_t i=0; i<mask.size(); ++i)
    res = res || (mask[i]!=0);
  return res;
  }
template<typename T, size_t len> inline bool none_of(const vtp<T, len> &mask)
  { return !any_of(mask); }
template<typename T, size_t len> inline bool all_of(const vtp<T, len> &mask)
  {
  bool res = true;
  for (size_t i=0; i<mask.size(); ++i)
    res = res && (mask[i]!=0);
  return res;
  }
template<typename T> inline native_simd<T> vload(T v)
  { return native_simd<T>(v); }

#if defined(__AVX512F__)
#elif defined(__AVX__)
template<> inline void vtp<double, 4>::from_scalar(const double &other)
  { v = _mm256_set1_pd(other); }
template<> inline vtp<double, 4> vtp<double, 4>::sqrt() const
  { return vtp<double, 4>(_mm256_sqrt_pd(v)); }
template<> inline vtp<double, 4> vtp<double, 4>::operator-() const
  { return vtp<double, 4>(_mm256_xor_pd(_mm256_set1_pd(-0.), v)); }
template<> inline vtp<double, 4> abs(vtp<double, 4> v)
  { return vtp<double, 4>(_mm256_andnot_pd(_mm256_set1_pd(-0.), v.v)); }
template<> inline bool any_of(const vtp<int64_t, 4> &mask)
  { return _mm256_movemask_pd(__m256d(mask.v)) != 0; }
template<> inline bool none_of(const vtp<int64_t, 4> &mask)
  { return _mm256_movemask_pd(__m256d(mask.v)) == 0; }
template<> inline bool all_of(const vtp<int64_t, 4> &mask)
  { return _mm256_movemask_pd(__m256d(mask.v)) == 15; }
#endif

}

using detail_simd::native_simd;
using detail_simd::reduce;
using detail_simd::max;
using detail_simd::abs;
using detail_simd::sqrt;
using detail_simd::any_of;
using detail_simd::none_of;
using detail_simd::all_of;
using detail_simd::vload;

}
}

#endif
#endif
#endif
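The commit introduces this header as fallback SIMD functionality: when MRUTIL_HOMEGROWN_SIMD is defined, the vtp class template (built on GCC vector extensions) stands in for std::experimental::simd. The snippet below is a minimal usage sketch, not part of the commit; the file name test_simd.cc and the printed values are illustrative, and it assumes a compiler/target for which the header enables its vector path (gcc/clang >= 5 on x86). It exercises the interface that callers such as libsharp2/sharp_core_inc.cc rely on: broadcast construction, element-wise arithmetic, masked updates via where(), and horizontal reduction via reduce().

// test_simd.cc -- hypothetical example, not part of the repository
#include <cstdio>
#include <functional>
#include "mr_util/simd.h"

using std::experimental::native_simd;
using std::experimental::reduce;
using std::experimental::any_of;

int main()
  {
  native_simd<double> x = 3.;                // broadcast: every lane holds 3.0
  native_simd<double> y = 1.;
  x += y;                                    // element-wise add: every lane is 4.0
  auto m = x > native_simd<double>(3.5);     // lane-wise comparison -> mask
  where(m, x) -= native_simd<double>(2.);    // masked update on selected lanes
  double s = reduce(x, std::plus<>());       // horizontal sum over all lanes
  printf("lanes=%zu sum=%g any=%d\n", x.size(), s, int(any_of(m)));
  return 0;
  }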