Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
24c49dc4
Commit
24c49dc4
authored
Jan 10, 2013
by
Thomas Auckenthaler
Browse files
Merge branch 'master' of elpa-lib.fhi-berlin.mpg.de:/pub/elpa_lib
parents
737e8df5
952b8091
Changes
27
Expand all
Hide whitespace changes
Inline
Side-by-side
ELPA_2011.12.Intrinsics/src/elpa2.f90
View file @
24c49dc4
...
@@ -1703,7 +1703,7 @@ contains
...
@@ -1703,7 +1703,7 @@ contains
do
j
=
ncols
,
2
,
-2
do
j
=
ncols
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
call
double_hh_trafo
_2hv
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
call
double_hh_trafo
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
enddo
enddo
if
(
j
==
1
)
call
single_hh_trafo
(
a
(
1
,
1
+
off
+
a_off
,
istripe
),
bcast_buffer
(
1
,
off
+1
),
nbw
,
nl
,
stripe_width
)
if
(
j
==
1
)
call
single_hh_trafo
(
a
(
1
,
1
+
off
+
a_off
,
istripe
),
bcast_buffer
(
1
,
off
+1
),
nbw
,
nl
,
stripe_width
)
...
@@ -1713,12 +1713,12 @@ contains
...
@@ -1713,12 +1713,12 @@ contains
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! w(:,3) = bcast_buffer(1:nbw,j+off-2)
! w(:,3) = bcast_buffer(1:nbw,j+off-2)
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! call
double
_hh_trafo
_4hv
(a(1,j+off+a_off-3,istripe), w, nbw, nl, stripe_width, nbw)
! call
quad
_hh_trafo(a(1,j+off+a_off-3,istripe), w, nbw, nl, stripe_width, nbw)
!enddo
!enddo
!do jj = j, 2, -2
!do jj = j, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! call double_hh_trafo
_2hv
(a(1,jj+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
! call double_hh_trafo(a(1,jj+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
!enddo
!enddo
!if(jj==1) call single_hh_trafo(a(1,1+off+a_off,istripe),bcast_buffer(1,off+1), nbw, nl, stripe_width)
!if(jj==1) call single_hh_trafo(a(1,1+off+a_off,istripe),bcast_buffer(1,off+1), nbw, nl, stripe_width)
...
@@ -1730,19 +1730,19 @@ contains
...
@@ -1730,19 +1730,19 @@ contains
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! w(:,4) = bcast_buffer(1:nbw,j+off-3)
! w(:,5) = bcast_buffer(1:nbw,j+off-4)
! w(:,5) = bcast_buffer(1:nbw,j+off-4)
! w(:,6) = bcast_buffer(1:nbw,j+off-5)
! w(:,6) = bcast_buffer(1:nbw,j+off-5)
! call
double
_hh_trafo
_6hv
(a(1,j+off+a_off-5,istripe), w, nbw, nl, stripe_width, nbw)
! call
hexa
_hh_trafo(a(1,j+off+a_off-5,istripe), w, nbw, nl, stripe_width, nbw)
!enddo
!enddo
!do jj = j, 4, -4
!do jj = j, 4, -4
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
! w(:,3) = bcast_buffer(1:nbw,jj+off-2)
! w(:,3) = bcast_buffer(1:nbw,jj+off-2)
! w(:,4) = bcast_buffer(1:nbw,jj+off-3)
! w(:,4) = bcast_buffer(1:nbw,jj+off-3)
! call
double
_hh_trafo
_4hv
(a(1,jj+off+a_off-3,istripe), w, nbw, nl, stripe_width, nbw)
! call
quad
_hh_trafo(a(1,jj+off+a_off-3,istripe), w, nbw, nl, stripe_width, nbw)
!enddo
!enddo
!do jjj = jj, 2, -2
!do jjj = jj, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jjj+off)
! w(:,1) = bcast_buffer(1:nbw,jjj+off)
! w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
! w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
! call double_hh_trafo
_2hv
(a(1,jjj+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
! call double_hh_trafo(a(1,jjj+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
!enddo
!enddo
!if(jjj==1) call single_hh_trafo(a(1,1+off+a_off,istripe),bcast_buffer(1,off+1), nbw, nl, stripe_width)
!if(jjj==1) call single_hh_trafo(a(1,1+off+a_off,istripe),bcast_buffer(1,off+1), nbw, nl, stripe_width)
...
@@ -2787,6 +2787,7 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r
...
@@ -2787,6 +2787,7 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r
a_dim2
=
max_blk_size
+
nbw
a_dim2
=
max_blk_size
+
nbw
!DEC$ ATTRIBUTES ALIGN: 64:: a
allocate
(
a
(
stripe_width
,
a_dim2
,
stripe_count
))
allocate
(
a
(
stripe_width
,
a_dim2
,
stripe_count
))
a
(:,:,:)
=
0
a
(:,:,:)
=
0
...
@@ -3197,6 +3198,25 @@ contains
...
@@ -3197,6 +3198,25 @@ contains
subroutine
compute_hh_trafo
(
off
,
ncols
,
istripe
)
subroutine
compute_hh_trafo
(
off
,
ncols
,
istripe
)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Currently (on Sandy Bridge), single is faster than double
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! integer off, ncols, istripe, j, nl, jj
! real*8 ttt
! complex*16 w(nbw,2)
!
! ttt = mpi_wtime()
! nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
! do j = ncols, 2, -2
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! call double_hh_trafo_complex(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
! enddo
! if(j==1) call single_hh_trafo_complex(a(1,1+off+a_off,istripe),bcast_buffer(1,off+1), nbw, nl, stripe_width)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Currently (on Sandy Bridge), single is faster than double
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
integer
off
,
ncols
,
istripe
,
j
,
nl
,
jj
integer
off
,
ncols
,
istripe
,
j
,
nl
,
jj
real
*
8
ttt
real
*
8
ttt
...
@@ -3205,6 +3225,7 @@ contains
...
@@ -3205,6 +3225,7 @@ contains
do
j
=
ncols
,
1
,
-1
do
j
=
ncols
,
1
,
-1
call
single_hh_trafo_complex
(
a
(
1
,
j
+
off
+
a_off
,
istripe
),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
call
single_hh_trafo_complex
(
a
(
1
,
j
+
off
+
a_off
,
istripe
),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
enddo
enddo
kernel_flops
=
kernel_flops
+
4
*
4
*
int
(
nl
,
8
)
*
int
(
ncols
,
8
)
*
int
(
nbw
,
8
)
kernel_flops
=
kernel_flops
+
4
*
4
*
int
(
nl
,
8
)
*
int
(
ncols
,
8
)
*
int
(
nbw
,
8
)
kernel_time
=
kernel_time
+
mpi_wtime
()
-
ttt
kernel_time
=
kernel_time
+
mpi_wtime
()
-
ttt
...
...
ELPA_2011.12.Intrinsics/src/elpa2_kernels/elpa2_kernels_complex.f90
View file @
24c49dc4
...
@@ -46,6 +46,51 @@ end
...
@@ -46,6 +46,51 @@ end
! --------------------------------------------------------------------------------------------------
! --------------------------------------------------------------------------------------------------
subroutine
double_hh_trafo_complex
(
q
,
hh
,
nb
,
nq
,
ldq
,
ldh
)
implicit
none
integer
,
intent
(
in
)
::
nb
,
nq
,
ldq
,
ldh
complex
*
16
,
intent
(
inout
)
::
q
(
ldq
,
*
)
complex
*
16
,
intent
(
in
)
::
hh
(
ldh
,
*
)
complex
*
16
s
integer
i
! Safety only:
if
(
mod
(
ldq
,
4
)
/
=
0
)
STOP
'double_hh_trafo: ldq not divisible by 4!'
! Calculate dot product of the two Householder vectors
s
=
conjg
(
hh
(
2
,
2
)
*
1
)
do
i
=
3
,
nb
s
=
s
+
(
conjg
(
hh
(
i
,
2
))
*
hh
(
i
-1
,
1
))
enddo
! Do the Householder transformations
! Always a multiple of 4 Q-rows is transformed, even if nq is smaller
do
i
=
1
,
nq
,
4
call
hh_trafo_complex_kernel_4_2hv
(
q
(
i
,
1
),
hh
,
nb
,
ldq
,
ldh
,
s
)
enddo
!do i=1,nq-8,12
! call hh_trafo_complex_kernel_12_2hv(q(i,1),hh, nb, ldq, ldh, s)
!enddo
! i > nq-8 now, i.e. at most 8 rows remain
!if(nq-i+1 > 4) then
! call hh_trafo_complex_kernel_8_2hv(q(i,1),hh, nb, ldq, ldh, s)
!else if(nq-i+1 > 0) then
! call hh_trafo_complex_kernel_4_2hv(q(i,1),hh, nb, ldq, ldh, s)
!endif
end
! --------------------------------------------------------------------------------------------------
subroutine
hh_trafo_complex_kernel_12
(
q
,
hh
,
nb
,
ldq
)
subroutine
hh_trafo_complex_kernel_12
(
q
,
hh
,
nb
,
ldq
)
implicit
none
implicit
none
...
@@ -264,3 +309,394 @@ subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq)
...
@@ -264,3 +309,394 @@ subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq)
end
end
! --------------------------------------------------------------------------------------------------
! --------------------------------------------------------------------------------------------------
subroutine
hh_trafo_complex_kernel_4_2hv
(
q
,
hh
,
nb
,
ldq
,
ldh
,
s
)
implicit
none
integer
,
intent
(
in
)
::
nb
,
ldq
,
ldh
complex
*
16
,
intent
(
inout
)
::
q
(
ldq
,
*
)
complex
*
16
,
intent
(
in
)
::
hh
(
ldh
,
*
)
complex
*
16
,
intent
(
in
)
::
s
complex
*
16
x1
,
x2
,
x3
,
x4
,
y1
,
y2
,
y3
,
y4
complex
*
16
h1
,
h2
,
tau1
,
tau2
integer
i
x1
=
q
(
1
,
2
)
x2
=
q
(
2
,
2
)
x3
=
q
(
3
,
2
)
x4
=
q
(
4
,
2
)
y1
=
q
(
1
,
1
)
+
q
(
1
,
2
)
*
conjg
(
hh
(
2
,
2
))
y2
=
q
(
2
,
1
)
+
q
(
2
,
2
)
*
conjg
(
hh
(
2
,
2
))
y3
=
q
(
3
,
1
)
+
q
(
3
,
2
)
*
conjg
(
hh
(
2
,
2
))
y4
=
q
(
4
,
1
)
+
q
(
4
,
2
)
*
conjg
(
hh
(
2
,
2
))
!DEC$ VECTOR ALIGNED
do
i
=
3
,
nb
h1
=
conjg
(
hh
(
i
-1
,
1
))
h2
=
conjg
(
hh
(
i
,
2
))
x1
=
x1
+
q
(
1
,
i
)
*
h1
y1
=
y1
+
q
(
1
,
i
)
*
h2
x2
=
x2
+
q
(
2
,
i
)
*
h1
y2
=
y2
+
q
(
2
,
i
)
*
h2
x3
=
x3
+
q
(
3
,
i
)
*
h1
y3
=
y3
+
q
(
3
,
i
)
*
h2
x4
=
x4
+
q
(
4
,
i
)
*
h1
y4
=
y4
+
q
(
4
,
i
)
*
h2
enddo
x1
=
x1
+
q
(
1
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x2
=
x2
+
q
(
2
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x3
=
x3
+
q
(
3
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x4
=
x4
+
q
(
4
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
tau1
=
hh
(
1
,
1
)
tau2
=
hh
(
1
,
2
)
h1
=
-
tau1
x1
=
x1
*
h1
x2
=
x2
*
h1
x3
=
x3
*
h1
x4
=
x4
*
h1
h1
=
-
tau2
h2
=
-
tau2
*
s
y1
=
y1
*
h1
+
x1
*
h2
y2
=
y2
*
h1
+
x2
*
h2
y3
=
y3
*
h1
+
x3
*
h2
y4
=
y4
*
h1
+
x4
*
h2
q
(
1
,
1
)
=
q
(
1
,
1
)
+
y1
q
(
2
,
1
)
=
q
(
2
,
1
)
+
y2
q
(
3
,
1
)
=
q
(
3
,
1
)
+
y3
q
(
4
,
1
)
=
q
(
4
,
1
)
+
y4
q
(
1
,
2
)
=
q
(
1
,
2
)
+
x1
+
y1
*
hh
(
2
,
2
)
q
(
2
,
2
)
=
q
(
2
,
2
)
+
x2
+
y2
*
hh
(
2
,
2
)
q
(
3
,
2
)
=
q
(
3
,
2
)
+
x3
+
y3
*
hh
(
2
,
2
)
q
(
4
,
2
)
=
q
(
4
,
2
)
+
x4
+
y4
*
hh
(
2
,
2
)
!DEC$ VECTOR ALIGNED
do
i
=
3
,
nb
h1
=
hh
(
i
-1
,
1
)
h2
=
hh
(
i
,
2
)
q
(
1
,
i
)
=
q
(
1
,
i
)
+
x1
*
h1
+
y1
*
h2
q
(
2
,
i
)
=
q
(
2
,
i
)
+
x2
*
h1
+
y2
*
h2
q
(
3
,
i
)
=
q
(
3
,
i
)
+
x3
*
h1
+
y3
*
h2
q
(
4
,
i
)
=
q
(
4
,
i
)
+
x4
*
h1
+
y4
*
h2
enddo
q
(
1
,
nb
+1
)
=
q
(
1
,
nb
+1
)
+
x1
*
hh
(
nb
,
1
)
q
(
2
,
nb
+1
)
=
q
(
2
,
nb
+1
)
+
x2
*
hh
(
nb
,
1
)
q
(
3
,
nb
+1
)
=
q
(
3
,
nb
+1
)
+
x3
*
hh
(
nb
,
1
)
q
(
4
,
nb
+1
)
=
q
(
4
,
nb
+1
)
+
x4
*
hh
(
nb
,
1
)
end
! --------------------------------------------------------------------------------------------------
subroutine
hh_trafo_complex_kernel_8_2hv
(
q
,
hh
,
nb
,
ldq
,
ldh
,
s
)
implicit
none
integer
,
intent
(
in
)
::
nb
,
ldq
,
ldh
complex
*
16
,
intent
(
inout
)
::
q
(
ldq
,
*
)
complex
*
16
,
intent
(
in
)
::
hh
(
ldh
,
*
)
complex
*
16
,
intent
(
in
)
::
s
complex
*
16
x1
,
x2
,
x3
,
x4
,
x5
,
x6
,
x7
,
x8
,
y1
,
y2
,
y3
,
y4
,
y5
,
y6
,
y7
,
y8
complex
*
16
h1
,
h2
,
tau1
,
tau2
integer
i
x1
=
q
(
1
,
2
)
x2
=
q
(
2
,
2
)
x3
=
q
(
3
,
2
)
x4
=
q
(
4
,
2
)
x5
=
q
(
5
,
2
)
x6
=
q
(
6
,
2
)
x7
=
q
(
7
,
2
)
x8
=
q
(
8
,
2
)
y1
=
q
(
1
,
1
)
+
q
(
1
,
2
)
*
conjg
(
hh
(
2
,
2
))
y2
=
q
(
2
,
1
)
+
q
(
2
,
2
)
*
conjg
(
hh
(
2
,
2
))
y3
=
q
(
3
,
1
)
+
q
(
3
,
2
)
*
conjg
(
hh
(
2
,
2
))
y4
=
q
(
4
,
1
)
+
q
(
4
,
2
)
*
conjg
(
hh
(
2
,
2
))
y5
=
q
(
5
,
1
)
+
q
(
5
,
2
)
*
conjg
(
hh
(
2
,
2
))
y6
=
q
(
6
,
1
)
+
q
(
6
,
2
)
*
conjg
(
hh
(
2
,
2
))
y7
=
q
(
7
,
1
)
+
q
(
7
,
2
)
*
conjg
(
hh
(
2
,
2
))
y8
=
q
(
8
,
1
)
+
q
(
8
,
2
)
*
conjg
(
hh
(
2
,
2
))
!DEC$ VECTOR ALIGNED
do
i
=
3
,
nb
h1
=
conjg
(
hh
(
i
-1
,
1
))
h2
=
conjg
(
hh
(
i
,
2
))
x1
=
x1
+
q
(
1
,
i
)
*
h1
y1
=
y1
+
q
(
1
,
i
)
*
h2
x2
=
x2
+
q
(
2
,
i
)
*
h1
y2
=
y2
+
q
(
2
,
i
)
*
h2
x3
=
x3
+
q
(
3
,
i
)
*
h1
y3
=
y3
+
q
(
3
,
i
)
*
h2
x4
=
x4
+
q
(
4
,
i
)
*
h1
y4
=
y4
+
q
(
4
,
i
)
*
h2
x5
=
x5
+
q
(
5
,
i
)
*
h1
y5
=
y5
+
q
(
5
,
i
)
*
h2
x6
=
x6
+
q
(
6
,
i
)
*
h1
y6
=
y6
+
q
(
6
,
i
)
*
h2
x7
=
x7
+
q
(
7
,
i
)
*
h1
y7
=
y7
+
q
(
7
,
i
)
*
h2
x8
=
x8
+
q
(
8
,
i
)
*
h1
y8
=
y8
+
q
(
8
,
i
)
*
h2
enddo
x1
=
x1
+
q
(
1
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x2
=
x2
+
q
(
2
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x3
=
x3
+
q
(
3
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x4
=
x4
+
q
(
4
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x5
=
x5
+
q
(
5
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x6
=
x6
+
q
(
6
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x7
=
x7
+
q
(
7
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x8
=
x8
+
q
(
8
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
tau1
=
hh
(
1
,
1
)
tau2
=
hh
(
1
,
2
)
h1
=
-
tau1
x1
=
x1
*
h1
x2
=
x2
*
h1
x3
=
x3
*
h1
x4
=
x4
*
h1
x5
=
x5
*
h1
x6
=
x6
*
h1
x7
=
x7
*
h1
x8
=
x8
*
h1
h1
=
-
tau2
h2
=
-
tau2
*
s
y1
=
y1
*
h1
+
x1
*
h2
y2
=
y2
*
h1
+
x2
*
h2
y3
=
y3
*
h1
+
x3
*
h2
y4
=
y4
*
h1
+
x4
*
h2
y5
=
y5
*
h1
+
x5
*
h2
y6
=
y6
*
h1
+
x6
*
h2
y7
=
y7
*
h1
+
x7
*
h2
y8
=
y8
*
h1
+
x8
*
h2
q
(
1
,
1
)
=
q
(
1
,
1
)
+
y1
q
(
2
,
1
)
=
q
(
2
,
1
)
+
y2
q
(
3
,
1
)
=
q
(
3
,
1
)
+
y3
q
(
4
,
1
)
=
q
(
4
,
1
)
+
y4
q
(
5
,
1
)
=
q
(
5
,
1
)
+
y5
q
(
6
,
1
)
=
q
(
6
,
1
)
+
y6
q
(
7
,
1
)
=
q
(
7
,
1
)
+
y7
q
(
8
,
1
)
=
q
(
8
,
1
)
+
y8
q
(
1
,
2
)
=
q
(
1
,
2
)
+
x1
+
y1
*
hh
(
2
,
2
)
q
(
2
,
2
)
=
q
(
2
,
2
)
+
x2
+
y2
*
hh
(
2
,
2
)
q
(
3
,
2
)
=
q
(
3
,
2
)
+
x3
+
y3
*
hh
(
2
,
2
)
q
(
4
,
2
)
=
q
(
4
,
2
)
+
x4
+
y4
*
hh
(
2
,
2
)
q
(
5
,
2
)
=
q
(
5
,
2
)
+
x5
+
y5
*
hh
(
2
,
2
)
q
(
6
,
2
)
=
q
(
6
,
2
)
+
x6
+
y6
*
hh
(
2
,
2
)
q
(
7
,
2
)
=
q
(
7
,
2
)
+
x7
+
y7
*
hh
(
2
,
2
)
q
(
8
,
2
)
=
q
(
8
,
2
)
+
x8
+
y8
*
hh
(
2
,
2
)
!DEC$ VECTOR ALIGNED
do
i
=
3
,
nb
h1
=
hh
(
i
-1
,
1
)
h2
=
hh
(
i
,
2
)
q
(
1
,
i
)
=
q
(
1
,
i
)
+
x1
*
h1
+
y1
*
h2
q
(
2
,
i
)
=
q
(
2
,
i
)
+
x2
*
h1
+
y2
*
h2
q
(
3
,
i
)
=
q
(
3
,
i
)
+
x3
*
h1
+
y3
*
h2
q
(
4
,
i
)
=
q
(
4
,
i
)
+
x4
*
h1
+
y4
*
h2
q
(
5
,
i
)
=
q
(
5
,
i
)
+
x5
*
h1
+
y5
*
h2
q
(
6
,
i
)
=
q
(
6
,
i
)
+
x6
*
h1
+
y6
*
h2
q
(
7
,
i
)
=
q
(
7
,
i
)
+
x7
*
h1
+
y7
*
h2
q
(
8
,
i
)
=
q
(
8
,
i
)
+
x8
*
h1
+
y8
*
h2
enddo
q
(
1
,
nb
+1
)
=
q
(
1
,
nb
+1
)
+
x1
*
hh
(
nb
,
1
)
q
(
2
,
nb
+1
)
=
q
(
2
,
nb
+1
)
+
x2
*
hh
(
nb
,
1
)
q
(
3
,
nb
+1
)
=
q
(
3
,
nb
+1
)
+
x3
*
hh
(
nb
,
1
)
q
(
4
,
nb
+1
)
=
q
(
4
,
nb
+1
)
+
x4
*
hh
(
nb
,
1
)
q
(
5
,
nb
+1
)
=
q
(
5
,
nb
+1
)
+
x5
*
hh
(
nb
,
1
)
q
(
6
,
nb
+1
)
=
q
(
6
,
nb
+1
)
+
x6
*
hh
(
nb
,
1
)
q
(
7
,
nb
+1
)
=
q
(
7
,
nb
+1
)
+
x7
*
hh
(
nb
,
1
)
q
(
8
,
nb
+1
)
=
q
(
8
,
nb
+1
)
+
x8
*
hh
(
nb
,
1
)
end
! --------------------------------------------------------------------------------------------------
subroutine
hh_trafo_complex_kernel_12_2hv
(
q
,
hh
,
nb
,
ldq
,
ldh
,
s
)
implicit
none
integer
,
intent
(
in
)
::
nb
,
ldq
,
ldh
complex
*
16
,
intent
(
inout
)
::
q
(
ldq
,
*
)
complex
*
16
,
intent
(
in
)
::
hh
(
ldh
,
*
)
complex
*
16
,
intent
(
in
)
::
s
complex
*
16
x1
,
x2
,
x3
,
x4
,
x5
,
x6
,
x7
,
x8
,
x9
,
x10
,
x11
,
x12
,
y1
,
y2
,
y3
,
y4
,
y5
,
y6
,
y7
,
y8
,
y9
,
y10
,
y11
,
y12
complex
*
16
h1
,
h2
,
tau1
,
tau2
integer
i
x1
=
q
(
1
,
2
)
x2
=
q
(
2
,
2
)
x3
=
q
(
3
,
2
)
x4
=
q
(
4
,
2
)
x5
=
q
(
5
,
2
)
x6
=
q
(
6
,
2
)
x7
=
q
(
7
,
2
)
x8
=
q
(
8
,
2
)
x9
=
q
(
9
,
2
)
x10
=
q
(
10
,
2
)
x11
=
q
(
11
,
2
)
x12
=
q
(
12
,
2
)
y1
=
q
(
1
,
1
)
+
q
(
1
,
2
)
*
conjg
(
hh
(
2
,
2
))
y2
=
q
(
2
,
1
)
+
q
(
2
,
2
)
*
conjg
(
hh
(
2
,
2
))
y3
=
q
(
3
,
1
)
+
q
(
3
,
2
)
*
conjg
(
hh
(
2
,
2
))
y4
=
q
(
4
,
1
)
+
q
(
4
,
2
)
*
conjg
(
hh
(
2
,
2
))
y5
=
q
(
5
,
1
)
+
q
(
5
,
2
)
*
conjg
(
hh
(
2
,
2
))
y6
=
q
(
6
,
1
)
+
q
(
6
,
2
)
*
conjg
(
hh
(
2
,
2
))
y7
=
q
(
7
,
1
)
+
q
(
7
,
2
)
*
conjg
(
hh
(
2
,
2
))
y8
=
q
(
8
,
1
)
+
q
(
8
,
2
)
*
conjg
(
hh
(
2
,
2
))
y9
=
q
(
9
,
1
)
+
q
(
9
,
2
)
*
conjg
(
hh
(
2
,
2
))
y10
=
q
(
10
,
1
)
+
q
(
10
,
2
)
*
conjg
(
hh
(
2
,
2
))
y11
=
q
(
11
,
1
)
+
q
(
11
,
2
)
*
conjg
(
hh
(
2
,
2
))
y12
=
q
(
12
,
1
)
+
q
(
12
,
2
)
*
conjg
(
hh
(
2
,
2
))
!DEC$ VECTOR ALIGNED
do
i
=
3
,
nb
h1
=
conjg
(
hh
(
i
-1
,
1
))
h2
=
conjg
(
hh
(
i
,
2
))
x1
=
x1
+
q
(
1
,
i
)
*
h1
y1
=
y1
+
q
(
1
,
i
)
*
h2
x2
=
x2
+
q
(
2
,
i
)
*
h1
y2
=
y2
+
q
(
2
,
i
)
*
h2
x3
=
x3
+
q
(
3
,
i
)
*
h1
y3
=
y3
+
q
(
3
,
i
)
*
h2
x4
=
x4
+
q
(
4
,
i
)
*
h1
y4
=
y4
+
q
(
4
,
i
)
*
h2
x5
=
x5
+
q
(
5
,
i
)
*
h1
y5
=
y5
+
q
(
5
,
i
)
*
h2
x6
=
x6
+
q
(
6
,
i
)
*
h1
y6
=
y6
+
q
(
6
,
i
)
*
h2
x7
=
x7
+
q
(
7
,
i
)
*
h1
y7
=
y7
+
q
(
7
,
i
)
*
h2
x8
=
x8
+
q
(
8
,
i
)
*
h1
y8
=
y8
+
q
(
8
,
i
)
*
h2
x9
=
x9
+
q
(
9
,
i
)
*
h1
y9
=
y9
+
q
(
9
,
i
)
*
h2
x10
=
x10
+
q
(
10
,
i
)
*
h1
y10
=
y10
+
q
(
10
,
i
)
*
h2
x11
=
x11
+
q
(
11
,
i
)
*
h1
y11
=
y11
+
q
(
11
,
i
)
*
h2
x12
=
x12
+
q
(
12
,
i
)
*
h1
y12
=
y12
+
q
(
12
,
i
)
*
h2
enddo
x1
=
x1
+
q
(
1
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x2
=
x2
+
q
(
2
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x3
=
x3
+
q
(
3
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x4
=
x4
+
q
(
4
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x5
=
x5
+
q
(
5
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x6
=
x6
+
q
(
6
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x7
=
x7
+
q
(
7
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x8
=
x8
+
q
(
8
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x9
=
x9
+
q
(
9
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x10
=
x10
+
q
(
10
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x11
=
x11
+
q
(
11
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
x12
=
x12
+
q
(
12
,
nb
+1
)
*
conjg
(
hh
(
nb
,
1
))
tau1
=
hh
(
1
,
1
)
tau2
=
hh
(
1
,
2
)
h1
=
-
tau1
x1
=
x1
*
h1
x2
=
x2
*
h1
x3
=
x3
*
h1
x4
=
x4
*
h1
x5
=
x5
*
h1
x6
=
x6
*
h1
x7
=
x7
*
h1
x8
=
x8
*
h1
x9
=
x9
*
h1
x10
=
x10
*
h1
x11
=
x11
*
h1
x12
=
x12
*
h1
h1
=
-
tau2
h2
=
-
tau2
*
s
y1
=
y1
*
h1
+
x1
*
h2
y2
=
y2
*
h1
+
x2
*
h2
y3
=
y3
*
h1
+
x3
*
h2
y4
=
y4
*
h1
+
x4
*
h2
y5
=
y5
*
h1
+
x5
*
h2
y6
=
y6
*
h1
+
x6
*
h2
y7
=
y7
*
h1
+
x7
*
h2
y8
=
y8
*
h1
+
x8
*
h2
y9
=
y9
*
h1
+
x9
*
h2
y10
=
y10
*
h1
+
x10
*
h2
y11
=
y11
*
h1
+
x11
*
h2
y12
=
y12
*
h1
+
x12
*
h2
q
(
1
,
1
)
=
q
(
1
,
1
)
+
y1
q
(
2
,
1
)
=
q
(
2
,
1
)
+
y2
q
(
3
,
1
)
=
q
(
3
,
1
)
+
y3
q
(
4
,
1
)
=
q
(
4
,
1
)
+
y4
q
(
5
,
1
)
=
q
(
5
,
1
)
+
y5
q
(
6
,
1
)
=
q
(
6
,
1
)
+
y6
q
(
7
,
1
)
=
q
(
7
,
1
)
+
y7
q
(
8
,
1
)
=
q
(
8
,
1
)
+
y8
q
(
9
,
1
)
=
q
(
9
,
1
)
+
y9
q
(
10
,
1
)
=
q
(
10
,
1
)
+
y10
q
(
11
,
1
)
=
q
(
11
,
1
)
+
y11
q
(
12
,
1
)
=
q
(
12
,
1
)
+
y12
q
(
1
,
2
)
=
q
(
1
,
2
)
+
x1
+
y1
*
hh
(
2
,
2
)
q
(
2
,
2
)
=
q
(
2
,
2
)
+
x2
+
y2
*
hh
(