Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
0f93fec7
Commit
0f93fec7
authored
Jul 13, 2012
by
Alexander Heinecke
Browse files
finalized integration of x86 intrinisic kernels
parent
870ac371
Changes
13
Hide whitespace changes
Inline
Side-by-side
ELPA_development_version/src/elpa2.f90
View file @
0f93fec7
...
...
@@ -2027,50 +2027,30 @@ contains
if
(
nl
<=
0
)
return
endif
!FORTRAN CODE
!FORTRAN CODE
/ X86 INRINISIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
do
j
=
ncols
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
call
double_hh_trafo
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
call
double_hh_trafo
_2hv
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
enddo
if
(
j
==
1
)
call
single_hh_trafo
(
a
(
1
,
1
+
off
+
a_off
,
istripe
,
my_thread
),
bcast_buffer
(
1
,
off
+1
),
nbw
,
nl
,
stripe_width
)
!INTRINSIC CODE, USING 2 HOUSEHOLDER VECTORS
!do j = ncols, 2, -2
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! if (mod(nl,24) == 0) then
! call double_hh_trafo_2hv_fast(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_2hv(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
!enddo
!if(j==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
!INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
!do j = ncols, 4, -4
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! w(:,3) = bcast_buffer(1:nbw,j+off-2)
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! if (mod(nl,12) == 0) then
! call double_hh_trafo_4hv_fast(a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_4hv(a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
! call double_hh_trafo_4hv(a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!enddo
!do jj = j, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! if (mod(nl,24) == 0) then
! call double_hh_trafo_2hv_fast(a(1,jj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_2hv(a(1,jj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
! call double_hh_trafo_2hv(a(1,jj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!enddo
!if(jj==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
!INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
!
X86
INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
!do j = ncols, 6, -6
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
...
...
@@ -2078,31 +2058,19 @@ contains
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! w(:,5) = bcast_buffer(1:nbw,j+off-4)
! w(:,6) = bcast_buffer(1:nbw,j+off-5)
! if (mod(nl,8) == 0) then
! call double_hh_trafo_6hv_fast(a(1,j+off+a_off-5,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_6hv(a(1,j+off+a_off-5,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
! call double_hh_trafo_6hv(a(1,j+off+a_off-5,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!enddo
!do jj = j, 4, -4
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! w(:,3) = bcast_buffer(1:nbw,jj+off-2)
! w(:,4) = bcast_buffer(1:nbw,jj+off-3)
! if (mod(nl,12) == 0) then
! call double_hh_trafo_4hv_fast(a(1,jj+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
! call double_hh_trafo_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!enddo
!do jjj = jj, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jjj+off)
! w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
! if (mod(nl,24) == 0) then
! call double_hh_trafo_2hv_fast(a(1,jjj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! else
! call double_hh_trafo_2hv(a(1,jjj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
! endif
! call double_hh_trafo_2hv(a(1,jjj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!enddo
!if(jjj==1) call single_hh_trafo(a(1,1+off+a_off,istripe,my_thread),bcast_buffer(1,off+1), nbw, nl, stripe_width)
...
...
ELPA_development_version/src/README_elpa2_kernels.txt
→
ELPA_development_version/src/
elpa2_kernels/
README_elpa2_kernels.txt
View file @
0f93fec7
...
...
@@ -28,18 +28,18 @@ Currently we offer the following alternatives for the ELPA2 kernels:
* elpa2_kernels_bg.f90 - Fortran code enhanced with assembler calls
for the IBM BlueGene/P
* elpa2_kernels_
asm_x86_64.s
- Optimized
assembler
code for x86_64
* elpa2_
tum_
kernels_
*.c
- Optimized
intrinisic
code for x86_64
systems (i.e. Intel/AMD architecture)
using SSE2/SSE3 operations.
(Use
GNU assembler for assembling
!)
(Use
gcc for compiling as Intel compiler generates slower code
!)
So which version should be used?
================================
* On x86_64 systems (i.e. almost all Intel/AMD systems) or on the IBM BlueGene/P
you should get the optimal performance using the optimized assembler versions
in elpa2_kernels_
asm_x86_64.s
or elpa2_kernels_bg.f90 respectively.
you should get the optimal performance using the optimized
intrinsics/
assembler versions
in elpa2_
tum_
kernels_
*.c
or elpa2_kernels_bg.f90 respectively.
* If you don't compile for one of these systems or you don't like to use assembler
for any reason, it is likely that you are best off using elpa2_kernels.f90.
...
...
ELPA_development_version/src/elpa2_kernels/elpa2_kernels_complex.f90
0 → 100644
View file @
0f93fec7
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
! It should be compiled with the highest possible optimization level.
!
! On Intel use -O3 -xSSE4.2 (or the SSE level fitting to your CPU)
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
subroutine
single_hh_trafo_complex
(
q
,
hh
,
nb
,
nq
,
ldq
)
implicit
none
integer
,
intent
(
in
)
::
nb
,
nq
,
ldq
complex
*
16
,
intent
(
inout
)
::
q
(
ldq
,
*
)
complex
*
16
,
intent
(
in
)
::
hh
(
*
)
integer
i
! Safety only:
if
(
mod
(
ldq
,
4
)
/
=
0
)
STOP
'double_hh_trafo: ldq not divisible by 4!'
! Do the Householder transformations
! Always a multiple of 4 Q-rows is transformed, even if nq is smaller
do
i
=
1
,
nq
-8
,
12
call
hh_trafo_complex_kernel_12
(
q
(
i
,
1
),
hh
,
nb
,
ldq
)
enddo
! i > nq-8 now, i.e. at most 8 rows remain
if
(
nq
-
i
+1
>
4
)
then
call
hh_trafo_complex_kernel_8
(
q
(
i
,
1
),
hh
,
nb
,
ldq
)
else
if
(
nq
-
i
+1
>
0
)
then
call
hh_trafo_complex_kernel_4
(
q
(
i
,
1
),
hh
,
nb
,
ldq
)
endif
end
! --------------------------------------------------------------------------------------------------
subroutine
hh_trafo_complex_kernel_12
(
q
,
hh
,
nb
,
ldq
)
implicit
none
integer
,
intent
(
in
)
::
nb
,
ldq
complex
*
16
,
intent
(
inout
)
::
q
(
ldq
,
*
)
complex
*
16
,
intent
(
in
)
::
hh
(
*
)
complex
*
16
x1
,
x2
,
x3
,
x4
,
x5
,
x6
,
x7
,
x8
,
x9
,
xa
,
xb
,
xc
complex
*
16
h1
,
tau1
integer
i
x1
=
q
(
1
,
1
)
x2
=
q
(
2
,
1
)
x3
=
q
(
3
,
1
)
x4
=
q
(
4
,
1
)
x5
=
q
(
5
,
1
)
x6
=
q
(
6
,
1
)
x7
=
q
(
7
,
1
)
x8
=
q
(
8
,
1
)
x9
=
q
(
9
,
1
)
xa
=
q
(
10
,
1
)
xb
=
q
(
11
,
1
)
xc
=
q
(
12
,
1
)
!DEC$ VECTOR ALIGNED
do
i
=
2
,
nb
h1
=
conjg
(
hh
(
i
))
x1
=
x1
+
q
(
1
,
i
)
*
h1
x2
=
x2
+
q
(
2
,
i
)
*
h1
x3
=
x3
+
q
(
3
,
i
)
*
h1
x4
=
x4
+
q
(
4
,
i
)
*
h1
x5
=
x5
+
q
(
5
,
i
)
*
h1
x6
=
x6
+
q
(
6
,
i
)
*
h1
x7
=
x7
+
q
(
7
,
i
)
*
h1
x8
=
x8
+
q
(
8
,
i
)
*
h1
x9
=
x9
+
q
(
9
,
i
)
*
h1
xa
=
xa
+
q
(
10
,
i
)
*
h1
xb
=
xb
+
q
(
11
,
i
)
*
h1
xc
=
xc
+
q
(
12
,
i
)
*
h1
enddo
tau1
=
hh
(
1
)
h1
=
-
tau1
x1
=
x1
*
h1
x2
=
x2
*
h1
x3
=
x3
*
h1
x4
=
x4
*
h1
x5
=
x5
*
h1
x6
=
x6
*
h1
x7
=
x7
*
h1
x8
=
x8
*
h1
x9
=
x9
*
h1
xa
=
xa
*
h1
xb
=
xb
*
h1
xc
=
xc
*
h1
q
(
1
,
1
)
=
q
(
1
,
1
)
+
x1
q
(
2
,
1
)
=
q
(
2
,
1
)
+
x2
q
(
3
,
1
)
=
q
(
3
,
1
)
+
x3
q
(
4
,
1
)
=
q
(
4
,
1
)
+
x4
q
(
5
,
1
)
=
q
(
5
,
1
)
+
x5
q
(
6
,
1
)
=
q
(
6
,
1
)
+
x6
q
(
7
,
1
)
=
q
(
7
,
1
)
+
x7
q
(
8
,
1
)
=
q
(
8
,
1
)
+
x8
q
(
9
,
1
)
=
q
(
9
,
1
)
+
x9
q
(
10
,
1
)
=
q
(
10
,
1
)
+
xa
q
(
11
,
1
)
=
q
(
11
,
1
)
+
xb
q
(
12
,
1
)
=
q
(
12
,
1
)
+
xc
!DEC$ VECTOR ALIGNED
do
i
=
2
,
nb
h1
=
hh
(
i
)
q
(
1
,
i
)
=
q
(
1
,
i
)
+
x1
*
h1
q
(
2
,
i
)
=
q
(
2
,
i
)
+
x2
*
h1
q
(
3
,
i
)
=
q
(
3
,
i
)
+
x3
*
h1
q
(
4
,
i
)
=
q
(
4
,
i
)
+
x4
*
h1
q
(
5
,
i
)
=
q
(
5
,
i
)
+
x5
*
h1
q
(
6
,
i
)
=
q
(
6
,
i
)
+
x6
*
h1
q
(
7
,
i
)
=
q
(
7
,
i
)
+
x7
*
h1
q
(
8
,
i
)
=
q
(
8
,
i
)
+
x8
*
h1
q
(
9
,
i
)
=
q
(
9
,
i
)
+
x9
*
h1
q
(
10
,
i
)
=
q
(
10
,
i
)
+
xa
*
h1
q
(
11
,
i
)
=
q
(
11
,
i
)
+
xb
*
h1
q
(
12
,
i
)
=
q
(
12
,
i
)
+
xc
*
h1
enddo
end
! --------------------------------------------------------------------------------------------------
subroutine
hh_trafo_complex_kernel_8
(
q
,
hh
,
nb
,
ldq
)
implicit
none
integer
,
intent
(
in
)
::
nb
,
ldq
complex
*
16
,
intent
(
inout
)
::
q
(
ldq
,
*
)
complex
*
16
,
intent
(
in
)
::
hh
(
*
)
complex
*
16
x1
,
x2
,
x3
,
x4
,
x5
,
x6
,
x7
,
x8
complex
*
16
h1
,
tau1
integer
i
x1
=
q
(
1
,
1
)
x2
=
q
(
2
,
1
)
x3
=
q
(
3
,
1
)
x4
=
q
(
4
,
1
)
x5
=
q
(
5
,
1
)
x6
=
q
(
6
,
1
)
x7
=
q
(
7
,
1
)
x8
=
q
(
8
,
1
)
!DEC$ VECTOR ALIGNED
do
i
=
2
,
nb
h1
=
conjg
(
hh
(
i
))
x1
=
x1
+
q
(
1
,
i
)
*
h1
x2
=
x2
+
q
(
2
,
i
)
*
h1
x3
=
x3
+
q
(
3
,
i
)
*
h1
x4
=
x4
+
q
(
4
,
i
)
*
h1
x5
=
x5
+
q
(
5
,
i
)
*
h1
x6
=
x6
+
q
(
6
,
i
)
*
h1
x7
=
x7
+
q
(
7
,
i
)
*
h1
x8
=
x8
+
q
(
8
,
i
)
*
h1
enddo
tau1
=
hh
(
1
)
h1
=
-
tau1
x1
=
x1
*
h1
x2
=
x2
*
h1
x3
=
x3
*
h1
x4
=
x4
*
h1
x5
=
x5
*
h1
x6
=
x6
*
h1
x7
=
x7
*
h1
x8
=
x8
*
h1
q
(
1
,
1
)
=
q
(
1
,
1
)
+
x1
q
(
2
,
1
)
=
q
(
2
,
1
)
+
x2
q
(
3
,
1
)
=
q
(
3
,
1
)
+
x3
q
(
4
,
1
)
=
q
(
4
,
1
)
+
x4
q
(
5
,
1
)
=
q
(
5
,
1
)
+
x5
q
(
6
,
1
)
=
q
(
6
,
1
)
+
x6
q
(
7
,
1
)
=
q
(
7
,
1
)
+
x7
q
(
8
,
1
)
=
q
(
8
,
1
)
+
x8
!DEC$ VECTOR ALIGNED
do
i
=
2
,
nb
h1
=
hh
(
i
)
q
(
1
,
i
)
=
q
(
1
,
i
)
+
x1
*
h1
q
(
2
,
i
)
=
q
(
2
,
i
)
+
x2
*
h1
q
(
3
,
i
)
=
q
(
3
,
i
)
+
x3
*
h1
q
(
4
,
i
)
=
q
(
4
,
i
)
+
x4
*
h1
q
(
5
,
i
)
=
q
(
5
,
i
)
+
x5
*
h1
q
(
6
,
i
)
=
q
(
6
,
i
)
+
x6
*
h1
q
(
7
,
i
)
=
q
(
7
,
i
)
+
x7
*
h1
q
(
8
,
i
)
=
q
(
8
,
i
)
+
x8
*
h1
enddo
end
! --------------------------------------------------------------------------------------------------
subroutine
hh_trafo_complex_kernel_4
(
q
,
hh
,
nb
,
ldq
)
implicit
none
integer
,
intent
(
in
)
::
nb
,
ldq
complex
*
16
,
intent
(
inout
)
::
q
(
ldq
,
*
)
complex
*
16
,
intent
(
in
)
::
hh
(
*
)
complex
*
16
x1
,
x2
,
x3
,
x4
complex
*
16
h1
,
tau1
integer
i
x1
=
q
(
1
,
1
)
x2
=
q
(
2
,
1
)
x3
=
q
(
3
,
1
)
x4
=
q
(
4
,
1
)
!DEC$ VECTOR ALIGNED
do
i
=
2
,
nb
h1
=
conjg
(
hh
(
i
))
x1
=
x1
+
q
(
1
,
i
)
*
h1
x2
=
x2
+
q
(
2
,
i
)
*
h1
x3
=
x3
+
q
(
3
,
i
)
*
h1
x4
=
x4
+
q
(
4
,
i
)
*
h1
enddo
tau1
=
hh
(
1
)
h1
=
-
tau1
x1
=
x1
*
h1
x2
=
x2
*
h1
x3
=
x3
*
h1
x4
=
x4
*
h1
q
(
1
,
1
)
=
q
(
1
,
1
)
+
x1
q
(
2
,
1
)
=
q
(
2
,
1
)
+
x2
q
(
3
,
1
)
=
q
(
3
,
1
)
+
x3
q
(
4
,
1
)
=
q
(
4
,
1
)
+
x4
!DEC$ VECTOR ALIGNED
do
i
=
2
,
nb
h1
=
hh
(
i
)
q
(
1
,
i
)
=
q
(
1
,
i
)
+
x1
*
h1
q
(
2
,
i
)
=
q
(
2
,
i
)
+
x2
*
h1
q
(
3
,
i
)
=
q
(
3
,
i
)
+
x3
*
h1
q
(
4
,
i
)
=
q
(
4
,
i
)
+
x4
*
h1
enddo
end
! --------------------------------------------------------------------------------------------------
ELPA_development_version/src/elpa2_kernels/elpa2_kernels_complex_bg.f90
0 → 100644
View file @
0f93fec7
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
!
! *** Special IBM BlueGene/P version with BlueGene assembler instructions in Fortran ***
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
subroutine
single_hh_trafo_complex
(
q
,
hh
,
nb
,
nq
,
ldq
)
implicit
none
integer
,
intent
(
in
)
::
nb
,
nq
,
ldq
complex
*
16
,
intent
(
inout
)
::
q
(
ldq
,
*
)
complex
*
16
,
intent
(
in
)
::
hh
(
*
)
integer
i
! Safety only:
if
(
mod
(
ldq
,
4
)
/
=
0
)
STOP
'double_hh_trafo: ldq not divisible by 4!'
! Do the Householder transformations
! Always a multiple of 4 Q-rows is transformed, even if nq is smaller
do
i
=
1
,
nq
-8
,
12
call
hh_trafo_complex_kernel_12
(
q
(
i
,
1
),
hh
,
nb
,
ldq
)
enddo
! i > nq-8 now, i.e. at most 8 rows remain
if
(
nq
-
i
+1
>
4
)
then
call
hh_trafo_complex_kernel_8
(
q
(
i
,
1
),
hh
,
nb
,
ldq
)
else
if
(
nq
-
i
+1
>
0
)
then
call
hh_trafo_complex_kernel_4
(
q
(
i
,
1
),
hh
,
nb
,
ldq
)
endif
end
! --------------------------------------------------------------------------------------------------
subroutine
hh_trafo_complex_kernel_12
(
q
,
hh
,
nb
,
ldq
)
implicit
none
integer
,
intent
(
in
)
::
nb
,
ldq
complex
*
16
,
intent
(
inout
)
::
q
(
ldq
,
*
)
complex
*
16
,
intent
(
in
)
::
hh
(
*
)
complex
*
16
x1
,
x2
,
x3
,
x4
,
x5
,
x6
,
x7
,
x8
,
x9
,
xa
,
xb
,
xc
complex
*
16
h1
,
tau1
integer
i
x1
=
q
(
1
,
1
)
x2
=
q
(
2
,
1
)
x3
=
q
(
3
,
1
)
x4
=
q
(
4
,
1
)
x5
=
q
(
5
,
1
)
x6
=
q
(
6
,
1
)
x7
=
q
(
7
,
1
)
x8
=
q
(
8
,
1
)
x9
=
q
(
9
,
1
)
xa
=
q
(
10
,
1
)
xb
=
q
(
11
,
1
)
xc
=
q
(
12
,
1
)
!DEC$ VECTOR ALIGNED
do
i
=
2
,
nb
h1
=
conjg
(
hh
(
i
))
x1
=
x1
+
q
(
1
,
i
)
*
h1
x2
=
x2
+
q
(
2
,
i
)
*
h1
x3
=
x3
+
q
(
3
,
i
)
*
h1
x4
=
x4
+
q
(
4
,
i
)
*
h1
x5
=
x5
+
q
(
5
,
i
)
*
h1
x6
=
x6
+
q
(
6
,
i
)
*
h1
x7
=
x7
+
q
(
7
,
i
)
*
h1
x8
=
x8
+
q
(
8
,
i
)
*
h1
x9
=
x9
+
q
(
9
,
i
)
*
h1
xa
=
xa
+
q
(
10
,
i
)
*
h1
xb
=
xb
+
q
(
11
,
i
)
*
h1
xc
=
xc
+
q
(
12
,
i
)
*
h1
enddo
tau1
=
hh
(
1
)
h1
=
-
tau1
x1
=
x1
*
h1
x2
=
x2
*
h1
x3
=
x3
*
h1
x4
=
x4
*
h1
x5
=
x5
*
h1
x6
=
x6
*
h1
x7
=
x7
*
h1
x8
=
x8
*
h1
x9
=
x9
*
h1
xa
=
xa
*
h1
xb
=
xb
*
h1
xc
=
xc
*
h1
q
(
1
,
1
)
=
q
(
1
,
1
)
+
x1
q
(
2
,
1
)
=
q
(
2
,
1
)
+
x2
q
(
3
,
1
)
=
q
(
3
,
1
)
+
x3
q
(
4
,
1
)
=
q
(
4
,
1
)
+
x4
q
(
5
,
1
)
=
q
(
5
,
1
)
+
x5
q
(
6
,
1
)
=
q
(
6
,
1
)
+
x6
q
(
7
,
1
)
=
q
(
7
,
1
)
+
x7
q
(
8
,
1
)
=
q
(
8
,
1
)
+
x8
q
(
9
,
1
)
=
q
(
9
,
1
)
+
x9
q
(
10
,
1
)
=
q
(
10
,
1
)
+
xa
q
(
11
,
1
)
=
q
(
11
,
1
)
+
xb
q
(
12
,
1
)
=
q
(
12
,
1
)
+
xc
!DEC$ VECTOR ALIGNED
do
i
=
2
,
nb
h1
=
hh
(
i
)
q
(
1
,
i
)
=
q
(
1
,
i
)
+
x1
*
h1
q
(
2
,
i
)
=
q
(
2
,
i
)
+
x2
*
h1
q
(
3
,
i
)
=
q
(
3
,
i
)
+
x3
*
h1
q
(
4
,
i
)
=
q
(
4
,
i
)
+
x4
*
h1
q
(
5
,
i
)
=
q
(
5
,
i
)
+
x5
*
h1
q
(
6
,
i
)
=
q
(
6
,
i
)
+
x6
*
h1
q
(
7
,
i
)
=
q
(
7
,
i
)
+
x7
*
h1
q
(
8
,
i
)
=
q
(
8
,
i
)
+
x8
*
h1
q
(
9
,
i
)
=
q
(
9
,
i
)
+
x9
*
h1
q
(
10
,
i
)
=
q
(
10
,
i
)
+
xa
*
h1
q
(
11
,
i
)
=
q
(
11
,
i
)
+
xb
*
h1
q
(
12
,
i
)
=
q
(
12
,
i
)
+
xc
*
h1
enddo
end
! --------------------------------------------------------------------------------------------------
subroutine
hh_trafo_complex_kernel_8
(
q
,
hh
,
nb
,
ldq
)
implicit
none
integer
,
intent
(
in
)
::
nb
,
ldq
complex
*
16
,
intent
(
inout
)
::
q
(
ldq
,
*
)
complex
*
16
,
intent
(
in
)
::
hh
(
*
)
complex
*
16
x1
,
x2
,
x3
,
x4
,
x5
,
x6
,
x7
,
x8
complex
*
16
h1
,
tau1
integer
i
x1
=
q
(
1
,
1
)
x2
=
q
(
2
,
1
)
x3
=
q
(
3
,
1
)
x4
=
q
(
4
,
1
)
x5
=
q
(
5
,
1
)
x6
=
q
(
6
,
1
)
x7
=
q
(
7
,
1
)
x8
=
q
(
8
,
1
)
!DEC$ VECTOR ALIGNED
do
i
=
2
,
nb
h1
=
conjg
(
hh
(
i
))
x1
=
x1
+
q
(
1
,
i
)
*
h1
x2
=
x2
+
q
(
2
,
i
)
*
h1
x3
=
x3
+
q
(
3
,
i
)
*
h1
x4
=
x4
+
q
(
4
,
i
)
*
h1
x5
=
x5
+
q
(
5
,
i
)
*
h1