Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
9c352986
Commit
9c352986
authored
Jul 27, 2017
by
Andreas Marek
Browse files
Better debuging of GPU
parent
d2828aed
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
src/elpa2/compute_hh_trafo.X90
View file @
9c352986
...
...
@@ -51,7 +51,7 @@
&
_
&
#endif
&
PRECISION
&
(
obj
,
a
,
a_dev
,
stripe_width
,
a_dim2
,
stripe_count
,
&
(
obj
,
useGPU
,
wantDebug
,
a
,
a_dev
,
stripe_width
,
a_dim2
,
stripe_count
,
&
#ifdef WITH_OPENMP
max_threads
,
l_nev
,
&
#endif
...
...
@@ -108,81 +108,98 @@
use
elpa_generated_fortran_interfaces
implicit
none
class
(
elpa_abstract_impl_t
),
intent
(
inout
)
::
obj
real
(
kind
=
c_double
),
intent
(
inout
)
::
kernel_time
! MPI_WTIME always needs double
integer
(
kind
=
lik
)
::
kernel_flops
integer
(
kind
=
ik
),
intent
(
in
)
::
nbw
,
max_blk_size
class
(
elpa_abstract_impl_t
),
intent
(
inout
)
::
obj
logical
,
intent
(
in
)
::
useGPU
,
wantDebug
real
(
kind
=
c_double
),
intent
(
inout
)
::
kernel_time
! MPI_WTIME always needs double
integer
(
kind
=
lik
)
::
kernel_flops
integer
(
kind
=
ik
),
intent
(
in
)
::
nbw
,
max_blk_size
#if REALCASE == 1
real
(
kind
=
C_DATATYPE_KIND
)
::
bcast_buffer
(
nbw
,
max_blk_size
)
real
(
kind
=
C_DATATYPE_KIND
)
::
bcast_buffer
(
nbw
,
max_blk_size
)
#endif
#if COMPLEXCASE == 1
complex
(
kind
=
C_DATATYPE_KIND
)
::
bcast_buffer
(
nbw
,
max_blk_size
)
complex
(
kind
=
C_DATATYPE_KIND
)
::
bcast_buffer
(
nbw
,
max_blk_size
)
#endif
integer
(
kind
=
ik
),
intent
(
in
)
::
a_off
integer
(
kind
=
ik
),
intent
(
in
)
::
a_off
integer
(
kind
=
ik
),
intent
(
in
)
::
stripe_width
,
a_dim2
,
stripe_count
integer
(
kind
=
ik
),
intent
(
in
)
::
stripe_width
,
a_dim2
,
stripe_count
#ifndef WITH_OPENMP
integer
(
kind
=
ik
),
intent
(
in
)
::
last_stripe_width
integer
(
kind
=
ik
),
intent
(
in
)
::
last_stripe_width
#if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count)
real
(
kind
=
C_DATATYPE_KIND
),
pointer
::
a
(:,:,:)
! real(kind=C_DATATYPE_KIND)
:: a(stripe_width,a_dim2,stripe_count)
real
(
kind
=
C_DATATYPE_KIND
),
pointer
::
a
(:,:,:)
#endif
#if COMPLEXCASE == 1
! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count)
complex
(
kind
=
C_DATATYPE_KIND
),
pointer
::
a
(:,:,:)
! complex(kind=C_DATATYPE_KIND)
:: a(stripe_width,a_dim2,stripe_count)
complex
(
kind
=
C_DATATYPE_KIND
),
pointer
::
a
(:,:,:)
#endif
#else /* WITH_OPENMP */
integer
(
kind
=
ik
),
intent
(
in
)
::
max_threads
,
l_nev
,
thread_width
integer
(
kind
=
ik
),
intent
(
in
)
::
max_threads
,
l_nev
,
thread_width
#if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
real
(
kind
=
C_DATATYPE_KIND
),
pointer
::
a
(:,:,:,:)
! real(kind=C_DATATYPE_KIND)
:: a(stripe_width,a_dim2,stripe_count,max_threads)
real
(
kind
=
C_DATATYPE_KIND
),
pointer
::
a
(:,:,:,:)
#endif
#if COMPLEXCASE == 1
! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
complex
(
kind
=
C_DATATYPE_KIND
),
pointer
::
a
(:,:,:,:)
! complex(kind=C_DATATYPE_KIND)
:: a(stripe_width,a_dim2,stripe_count,max_threads)
complex
(
kind
=
C_DATATYPE_KIND
),
pointer
::
a
(:,:,:,:)
#endif
#endif /* WITH_OPENMP */
integer
(
kind
=
ik
),
intent
(
in
)
::
kernel
integer
(
kind
=
ik
),
intent
(
in
)
::
kernel
integer
(
kind
=
c_intptr_t
)
::
a_dev
integer
(
kind
=
c_intptr_t
)
::
bcast_buffer_dev
integer
(
kind
=
c_intptr_t
)
::
a_dev
integer
(
kind
=
c_intptr_t
)
::
bcast_buffer_dev
#if REALCASE == 1
integer
(
kind
=
c_intptr_t
)
::
hh_dot_dev
! why not needed in complex case
integer
(
kind
=
c_intptr_t
)
::
hh_dot_dev
! why not needed in complex case
#endif
integer
(
kind
=
c_intptr_t
)
::
hh_tau_dev
integer
(
kind
=
c_intptr_t
)
::
dev_offset
,
dev_offset_1
,
dev_offset_2
integer
(
kind
=
c_intptr_t
)
::
hh_tau_dev
integer
(
kind
=
c_intptr_t
)
::
dev_offset
,
dev_offset_1
,
dev_offset_2
! Private variables in OMP regions (my_thread) should better be in the argument list!
integer
(
kind
=
ik
)
::
off
,
ncols
,
istripe
integer
(
kind
=
ik
)
::
off
,
ncols
,
istripe
#ifdef WITH_OPENMP
integer
(
kind
=
ik
)
::
my_thread
,
noff
integer
(
kind
=
ik
)
::
my_thread
,
noff
#endif
integer
(
kind
=
ik
)
::
j
,
nl
,
jj
,
jjj
,
n_times
integer
(
kind
=
ik
)
::
j
,
nl
,
jj
,
jjj
,
n_times
#if REALCASE == 1
real
(
kind
=
C_DATATYPE_KIND
)
::
w
(
nbw
,
6
)
real
(
kind
=
C_DATATYPE_KIND
)
::
w
(
nbw
,
6
)
#endif
#if COMPLEXCASE == 1
complex
(
kind
=
C_DATATYPE_KIND
)
::
w
(
nbw
,
2
)
complex
(
kind
=
C_DATATYPE_KIND
)
::
w
(
nbw
,
2
)
#endif
real
(
kind
=
c_double
)
::
ttt
! MPI_WTIME always needs double
real
(
kind
=
c_double
)
::
ttt
! MPI_WTIME always needs double
if
(
wantDebug
)
then
if
(
useGPU
.and.
&
#if REALCASE == 1
if
(
kernel
.eq.
ELPA_2STAGE_REAL_GPU
)
then
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if
(
ncols
<
1
)
return
(
kernel
.ne.
ELPA_2STAGE_REAL_GPU
))
then
#endif
#if COMPLEXCASE == 1
(
kernel
.ne.
ELPA_2STAGE_COMPLEX_GPU
))
then
#endif
print
*
,
"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!"
stop
endif
endif
#if REALCASE == 1
if
(
kernel
.eq.
ELPA_2STAGE_REAL_GPU
)
then
#endif
#if COMPLEXCASE == 1
if
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_GPU
)
then
#endif
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if
(
ncols
<
1
)
return
if
(
ncols
<
1
)
then
if
(
wantDebug
)
then
print
*
,
"Returning early from compute_hh_trafo"
endif
return
endif
endif
#endif
call
obj
%
timer
%
start
(
"compute_hh_trafo_&
&MATH_DATATYPE&
...
...
@@ -211,16 +228,16 @@
#if REALCASE == 1
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
print *,"
compute_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
GPU
OPENMP
:
not
yet
implemented
"
&
MATH_DATATYPE
&
&
_
GPU
OPENMP
:
not
yet
implemented
"
stop 1
endif
#endif
#if COMPLEXCASE == 1
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
print *,"
compute_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
GPU
OPENMP
:
not
yet
implemented
"
&
MATH_DATATYPE
&
&
_
GPU
OPENMP
:
not
yet
implemented
"
stop 1
endif
#endif
...
...
@@ -248,47 +265,60 @@
#if REALCASE == 1
! GPU kernel real
if
(
kernel
.eq.
ELPA_2STAGE_REAL_GPU
)
then
if
(
wantDebug
)
then
call
obj
%
timer
%
start
(
"compute_hh_trafo: GPU"
)
endif
dev_offset
=
(
0
+
(
a_off
*
stripe_width
)
+
(
(
istripe
-
1
)
*
stripe_width
*
a_dim2
))
*
size_of_
&
&
PRECISION
&
&
_
&
&
MATH_DATATYPE
&
_
&
&
MATH_DATATYPE
call
launch_compute_hh_trafo_gpu_kernel_
&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
&
(
a_dev
+
dev_offset
,
bcast_buffer_dev
,
hh_dot_dev
,
hh_tau_dev
,
nl
,
nbw
,
stripe_width
,
off
,
ncols
)
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
&
(
a_dev
+
dev_offset
,
bcast_buffer_dev
,
hh_dot_dev
,
hh_tau_dev
,
nl
,
nbw
,
stripe_width
,
off
,
ncols
)
#endif /* REALCASE */
#if COMPLEXCASE == 1
! GPU kernel complex
if
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_GPU
)
then
if
(
wantDebug
)
then
call
obj
%
timer
%
start
(
"compute_hh_trafo: GPU"
)
endif
dev_offset
=
(
0
+
(
(
a_off
+
off
-1
)
*
stripe_width
)
+
(
(
istripe
-
1
)
*
stripe_width
*
a_dim2
))
*
size_of_
&
&
PRECISION
&
&
_
&
&
MATH_DATATYPE
&
_
&
&
MATH_DATATYPE
dev_offset_1
=
(
0
+
(
off
-1
)
*
nbw
)
*
size_of_
&
&
PRECISION
&
&
_
&
&
MATH_DATATYPE
&
_
&
&
MATH_DATATYPE
dev_offset_2
=
(
off
-1
)
*
size_of_
&
&
PRECISION
&
&
_
&
&
MATH_DATATYPE
&
_
&
&
MATH_DATATYPE
call
launch_compute_hh_trafo_gpu_kernel_
&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
&
(
a_dev
+
dev_offset
,
bcast_buffer_dev
+
dev_offset_1
,
&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
&
(
a_dev
+
dev_offset
,
bcast_buffer_dev
+
dev_offset_1
,
&
hh_tau_dev
+
dev_offset_2
,
nl
,
nbw
,
stripe_width
,
off
,
ncols
)
#endif /* COMPLEXCASE */
if
(
wantDebug
)
then
call
obj
%
timer
%
stop
(
"compute_hh_trafo: GPU"
)
endif
else
! not CUDA kernel
if
(
wantDebug
)
then
call
obj
%
timer
%
start
(
"compute_hh_trafo: CPU"
)
endif
#if REALCASE == 1
#ifndef WITH_FIXED_REAL_KERNEL
if
(
kernel
.eq.
ELPA_2STAGE_REAL_AVX_BLOCK2
.or.
&
...
...
@@ -321,17 +351,17 @@
#ifdef USE_ASSUMED_SIZE
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
generic_
&
&
PRECISION
&
&
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
&
MATH_DATATYPE
&
&
_
generic_
&
&
PRECISION
&
&
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
generic_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
j
+
off
+
a_off
-1
:
j
+
off
+
a_off
+
nbw
-1
,
istripe
,
my_thread
),
w
(
1
:
nbw
,
1
:
6
),
&
&
MATH_DATATYPE
&
&
_
generic_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
j
+
off
+
a_off
-1
:
j
+
off
+
a_off
+
nbw
-1
,
istripe
,
my_thread
),
w
(
1
:
nbw
,
1
:
6
),
&
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
...
...
@@ -339,17 +369,17 @@
#ifdef USE_ASSUMED_SIZE
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
generic_
&
&
PRECISION
&
&
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
&
MATH_DATATYPE
&
&
_
generic_
&
&
PRECISION
&
&
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
generic_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
j
+
off
+
a_off
-1
:
j
+
off
+
a_off
+
nbw
-1
,
istripe
),
w
(
1
:
nbw
,
1
:
6
),
nbw
,
nl
,
stripe_width
,
nbw
)
&
MATH_DATATYPE
&
&
_
generic_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
j
+
off
+
a_off
-1
:
j
+
off
+
a_off
+
nbw
-1
,
istripe
),
w
(
1
:
nbw
,
1
:
6
),
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
#endif /* WITH_OPENMP */
...
...
@@ -376,32 +406,34 @@
#ifdef USE_ASSUMED_SIZE
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
generic_
&
&
PRECISION
&
&
(
a
(
1
,
j
+
off
+
a_off
,
istripe
,
my_thread
),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
&
MATH_DATATYPE
&
&
_
generic_
&
&
PRECISION
&
&
(
a
(
1
,
j
+
off
+
a_off
,
istripe
,
my_thread
),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#else
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
generic_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
j
+
off
+
a_off
:
j
+
off
+
a_off
+
nbw
-1
,
istripe
,
my_thread
),
bcast_buffer
(
1
:
nbw
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
&
MATH_DATATYPE
&
&
_
generic_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
j
+
off
+
a_off
:
j
+
off
+
a_off
+
nbw
-1
,
istripe
,
my_thread
),
&
bcast_buffer
(
1
:
nbw
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#endif
#else /* WITH_OPENMP */
#ifdef USE_ASSUMED_SIZE
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
generic_
&
&
PRECISION
&
&
(
a
(
1
,
j
+
off
+
a_off
,
istripe
),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
&
MATH_DATATYPE
&
&
_
generic_
&
&
PRECISION
&
&
(
a
(
1
,
j
+
off
+
a_off
,
istripe
),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#else
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
generic_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
j
+
off
+
a_off
:
j
+
off
+
a_off
+
nbw
-1
,
istripe
),
bcast_buffer
(
1
:
nbw
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
&
MATH_DATATYPE
&
&
_
generic_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
j
+
off
+
a_off
:
j
+
off
+
a_off
+
nbw
-1
,
istripe
),
bcast_buffer
(
1
:
nbw
,
j
+
off
),
&
nbw
,
nl
,
stripe_width
)
#endif
#endif /* WITH_OPENMP */
...
...
@@ -428,16 +460,16 @@
#ifdef USE_ASSUMED_SIZE
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
generic_simple_
&
&
PRECISION
&
&
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
&
MATH_DATATYPE
&
&
_
generic_simple_
&
&
PRECISION
&
&
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
generic_simple_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
j
+
off
+
a_off
-1
:
j
+
off
+
a_off
-1
+
nbw
,
istripe
,
my_thread
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
&
MATH_DATATYPE
&
&
_
generic_simple_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
j
+
off
+
a_off
-1
:
j
+
off
+
a_off
-1
+
nbw
,
istripe
,
my_thread
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
...
...
@@ -445,16 +477,16 @@
#ifdef USE_ASSUMED_SIZE
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
generic_simple_
&
&
PRECISION
&
&
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
&
MATH_DATATYPE
&
&
_
generic_simple_
&
&
PRECISION
&
&
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
generic_simple_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
j
+
off
+
a_off
-1
:
j
+
off
+
a_off
-1
+
nbw
,
istripe
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
&
MATH_DATATYPE
&
&
_
generic_simple_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
j
+
off
+
a_off
-1
:
j
+
off
+
a_off
-1
+
nbw
,
istripe
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
...
...
@@ -480,32 +512,34 @@
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
generic_simple_
&
&
PRECISION
&
&
(
a
(
1
,
j
+
off
+
a_off
,
istripe
,
my_thread
),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
&
MATH_DATATYPE
&
&
_
generic_simple_
&
&
PRECISION
&
&
(
a
(
1
,
j
+
off
+
a_off
,
istripe
,
my_thread
),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#else
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
generic_simple_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
j
+
off
+
a_off
:
j
+
off
+
a_off
+
nbw
-1
,
istripe
,
my_thread
),
bcast_buffer
(
1
:
nbw
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
&
MATH_DATATYPE
&
&
_
generic_simple_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
j
+
off
+
a_off
:
j
+
off
+
a_off
+
nbw
-1
,
istripe
,
my_thread
),
bcast_buffer
(
1
:
nbw
,
j
+
off
),
&
nbw
,
nl
,
stripe_width
)
#endif
#else /* WITH_OPENMP */
#ifdef USE_ASSUMED_SIZE
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
generic_simple_
&
&
PRECISION
&
&
(
a
(
1
,
j
+
off
+
a_off
,
istripe
),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
&
MATH_DATATYPE
&
&
_
generic_simple_
&
&
PRECISION
&
&
(
a
(
1
,
j
+
off
+
a_off
,
istripe
),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#else
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
generic_simple_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
j
+
off
+
a_off
:
j
+
off
+
a_off
+
nbw
-1
,
istripe
),
bcast_buffer
(
1
:
nbw
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
&
MATH_DATATYPE
&
&
_
generic_simple_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
j
+
off
+
a_off
:
j
+
off
+
a_off
+
nbw
-1
,
istripe
),
bcast_buffer
(
1
:
nbw
,
j
+
off
),
&
nbw
,
nl
,
stripe_width
)
#endif
#endif /* WITH_OPENMP */
...
...
@@ -530,17 +564,17 @@
#ifdef WITH_OPENMP
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
&
_
sse_assembly
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
&
_
&
&
PRECISION
&
&
_
sse_assembly
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
&
_
sse_assembly
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
&
_
sse_assembly
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
#ifndef WITH_FIXED_REAL_KERNEL
...
...
@@ -561,18 +595,18 @@
do
j
=
ncols
,
1
,
-1
#ifdef WITH_OPENMP
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
&
_
sse_assembly
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
,
my_thread
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
&
_
sse_assembly
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
,
my_thread
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#else
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
&
_
sse_assembly
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
&
_
sse_assembly
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#endif
enddo
#ifndef WITH_FIXED_COMPLEX_KERNEL
...
...
@@ -598,16 +632,16 @@
do
j
=
ncols
,
1
,
-1
#ifdef WITH_OPENMP
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sse_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
,
my_thread
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
&
MATH_DATATYPE
&
&
_
sse_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
,
my_thread
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#else
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sse_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
&
MATH_DATATYPE
&
&
_
sse_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
...
...
@@ -637,16 +671,16 @@
do
j
=
ncols
,
1
,
-1
#ifdef WITH_OPENMP
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
avx_avx2_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
,
my_thread
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
&
MATH_DATATYPE
&
&
_
avx_avx2_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
,
my_thread
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#else
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
avx_avx2_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
&
MATH_DATATYPE
&
&
_
avx_avx2_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */
...
...
@@ -675,16 +709,16 @@
do
j
=
ncols
,
1
,
-1
#ifdef WITH_OPENMP
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
avx512_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
,
my_thread
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
&
MATH_DATATYPE
&
&
_
avx512_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
,
my_thread
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#else
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
avx512_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
&
MATH_DATATYPE
&
&
_
avx512_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) ) */
...
...
@@ -710,16 +744,16 @@
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
#ifdef WITH_OPENMP
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sse_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
&
MATH_DATATYPE
&
&
_
sse_2hv_
&
&
PRECISION
&