elpa / elpa / Commits / 780e4570

Commit 780e4570, authored Jul 25, 2013 by Andreas Marek
Rename the kernel file name to fit to our naming convention
Parent 5893789f, 7 changed files

ELPA_2013.02.BETA/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s (new file, 0 → 100644)
# --------------------------------------------------------------------------------------------------
#
#  This file contains the compute intensive kernels for the Householder transformations,
#  coded in x86_64 assembler and using SSE2/SSE3 instructions.
#
#  It must be assembled with GNU assembler (just "as" on most Linux machines)
#
#  Copyright of the original code rests with the authors inside the ELPA
#  consortium. The copyright of any additional modifications shall rest
#  with their original authors, but shall adhere to the licensing terms
#  distributed along with the original code in the file "COPYING".
#
# --------------------------------------------------------------------------------------------------
        .globl double_hh_trafo_
        .globl single_hh_trafo_complex_

        .text
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
        .macro hh_trafo_real nrows

        # When this macro is called, the following registers are set and must not be changed
        # %rdi: Address of q
        # %rsi: Address of hh
        # %rdx: nb
        # %rcx: Remaining rows nq
        # %r8:  ldq in bytes
        # %r9:  ldh in bytes
        # %rax: address of hh at the end of the loops
        # The top of the stack must contain the dot product of the two Householder vectors
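
        # Register blocking, as used throughout this macro: each 16-byte movaps
        # moves two consecutive real*8 elements of q, so \nrows selects how many
        # xmm pairs are live:
        #   nrows == 4  -> x in %xmm0/%xmm1,  y in %xmm6/%xmm7
        #   nrows >= 8  -> additionally %xmm2/%xmm3 and %xmm8/%xmm9
        #   nrows == 12 -> additionally %xmm4/%xmm5 and %xmm10/%xmm11
        # %xmm12-%xmm15 stay free for scratch values and broadcast hh elements.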
        movq    %rdi, %r10          # Copy address of q
        movq    %rsi, %r11          # Copy address of hh

        # x1 = q(1,2)
        # x2 = q(2,2)
        #
        # y1 = q(1,1) + q(1,2)*hh(2,2)
        # y2 = q(2,1) + q(2,2)*hh(2,2)

        movaps  (%r10), %xmm6       # y1 = q(1,1)
        movaps  16(%r10), %xmm7     # y2 = q(2,1)
        .if \nrows>=8
        movaps  32(%r10), %xmm8
        movaps  48(%r10), %xmm9
        .if \nrows==12
        movaps  64(%r10), %xmm10
        movaps  80(%r10), %xmm11
        .endif
        .endif

        addq    %r8, %r10           # %r10 => q(.,2)
        movddup 8(%r11,%r9), %xmm15 # hh(2,2)
        .macro mac_pre_loop1 qoff, X, Y
        movaps  \qoff(%r10), \X     # xn = q(n,2)
        movaps  \X, %xmm12
        mulpd   %xmm15, %xmm12
        addpd   %xmm12, \Y          # yn = yn + xn*h(2,2)
        .endm

        mac_pre_loop1   0,  %xmm0, %xmm6
        mac_pre_loop1   16, %xmm1, %xmm7
        .if \nrows>=8
        mac_pre_loop1   32, %xmm2, %xmm8
        mac_pre_loop1   48, %xmm3, %xmm9
        .if \nrows==12
        mac_pre_loop1   64, %xmm4, %xmm10
        mac_pre_loop1   80, %xmm5, %xmm11
        .endif
        .endif
        .purgem mac_pre_loop1
        # do i=3,nb
        #   h1 = hh(i-1,1)
        #   h2 = hh(i,2)
        #   x1 = x1 + q(1,i)*h1
        #   y1 = y1 + q(1,i)*h2
        #   x2 = x2 + q(2,i)*h1
        #   y2 = y2 + q(2,i)*h2
        #   ...
        # enddo

        addq    $8, %r11
        .align 16
1:
        cmpq    %rax, %r11          # Jump out of the loop if %r11 >= %rax
        jge     2f

        addq    %r8, %r10           # %r10 => q(.,i)

        movddup (%r11), %xmm14      # hh(i-1,1)
        movddup 8(%r11,%r9), %xmm15 # hh(i,2)
        .macro mac_loop1 qoff, X, Y
        movaps  \qoff(%r10), %xmm13 # q(.,i)
        movaps  %xmm13, %xmm12
        mulpd   %xmm14, %xmm13
        addpd   %xmm13, \X          # xn = xn + q(.,i)*h1
        mulpd   %xmm15, %xmm12
        addpd   %xmm12, \Y          # yn = yn + q(.,i)*h2
        .endm

        mac_loop1   0,  %xmm0, %xmm6
        mac_loop1   16, %xmm1, %xmm7
        .if \nrows>=8
        mac_loop1   32, %xmm2, %xmm8
        mac_loop1   48, %xmm3, %xmm9
        .if \nrows==12
        mac_loop1   64, %xmm4, %xmm10
        mac_loop1   80, %xmm5, %xmm11
        .endif
        .endif
        .purgem mac_loop1

        addq    $8, %r11
        jmp     1b
2:
        # x1 = x1 + q(1,nb+1)*hh(nb,1)
        # x2 = x2 + q(2,nb+1)*hh(nb,1)

        addq    %r8, %r10           # %r10 => q(.,nb+1)
        movddup (%r11), %xmm14

        .macro mac_post_loop1 qoff, X
        movaps  \qoff(%r10), %xmm13 # q(.,nb+1)
        mulpd   %xmm14, %xmm13
        addpd   %xmm13, \X
        .endm

        mac_post_loop1  0,  %xmm0
        mac_post_loop1  16, %xmm1
        .if \nrows>=8
        mac_post_loop1  32, %xmm2
        mac_post_loop1  48, %xmm3
        .if \nrows==12
        mac_post_loop1  64, %xmm4
        mac_post_loop1  80, %xmm5
        .endif
        .endif
        .purgem mac_post_loop1
        # tau1 = hh(1,1)
        # tau2 = hh(1,2)
        #
        # h1 = -tau1
        # x1 = x1*h1
        # x2 = x2*h1

        movq    %rsi, %r11          # restore %r11 (hh(1,1))

        movddup (%r11), %xmm12      # hh(1,1)
        xorps   %xmm14, %xmm14
        subpd   %xmm12, %xmm14      # %xmm14 = -hh(1,1)

        mulpd   %xmm14, %xmm0
        mulpd   %xmm14, %xmm1
        .if \nrows>=8
        mulpd   %xmm14, %xmm2
        mulpd   %xmm14, %xmm3
        .if \nrows==12
        mulpd   %xmm14, %xmm4
        mulpd   %xmm14, %xmm5
        .endif
        .endif

        # h1 = -tau2
        # h2 = -tau2*s
        # y1 = y1*h1 + x1*h2
        # y2 = y2*h1 + x2*h2
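
        # Why h2 = -tau2*s (a short derivation; s = v1.v2 is the dot product
        # computed before entering this macro): applying both reflectors
        # H_k = I - tau_k*v_k*v_k^T to a row of q gives
        #   q := q - tau1*x*v1^T - tau2*(y - tau1*s*x)*v2^T
        # with x = q*v1 and y = q*v2. Since x was already scaled by h1 = -tau1
        # above, the cross term reduces to x*(-tau2*s), so the update below is
        #   y := y*(-tau2) + x*(-tau2*s)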
        movddup (%r11,%r9), %xmm12  # hh(1,2)
        xorps   %xmm15, %xmm15
        subpd   %xmm12, %xmm15      # %xmm15 = -hh(1,2) = h1
        movaps  %xmm15, %xmm14
        movddup (%rsp), %xmm12      # Get s from top of stack
        mulpd   %xmm12, %xmm14      # %xmm14 = h2

        .macro mac_xform_y X, Y
        mulpd   %xmm15, \Y          # y1 = y1*h1
        movaps  \X, %xmm12
        mulpd   %xmm14, %xmm12
        addpd   %xmm12, \Y
        .endm

        mac_xform_y %xmm0, %xmm6
        mac_xform_y %xmm1, %xmm7
        .if \nrows>=8
        mac_xform_y %xmm2, %xmm8
        mac_xform_y %xmm3, %xmm9
        .if \nrows==12
        mac_xform_y %xmm4, %xmm10
        mac_xform_y %xmm5, %xmm11
        .endif
        .endif
        .purgem mac_xform_y
        # q(1,1) = q(1,1) + y1
        # q(2,1) = q(2,1) + y2

        movq    %rdi, %r10          # restore original Q

        .macro mac_pre_loop2_1 qoff, Y
        movaps  \qoff(%r10), %xmm13 # q(.,1)
        addpd   \Y, %xmm13
        movaps  %xmm13, \qoff(%r10)
        .endm

        mac_pre_loop2_1 0,  %xmm6
        mac_pre_loop2_1 16, %xmm7
        .if \nrows>=8
        mac_pre_loop2_1 32, %xmm8
        mac_pre_loop2_1 48, %xmm9
        .if \nrows==12
        mac_pre_loop2_1 64, %xmm10
        mac_pre_loop2_1 80, %xmm11
        .endif
        .endif
        .purgem mac_pre_loop2_1
        # q(1,2) = q(1,2) + x1 + y1*hh(2,2)
        # q(2,2) = q(2,2) + x2 + y2*hh(2,2)

        addq    %r8, %r10           # %r10 => q(.,2)
        movddup 8(%r11,%r9), %xmm15 # hh(2,2)

        .macro mac_pre_loop2_2 qoff, X, Y
        movaps  \X, %xmm13
        movaps  \Y, %xmm12
        mulpd   %xmm15, %xmm12
        addpd   %xmm12, %xmm13
        addpd   \qoff(%r10), %xmm13
        movaps  %xmm13, \qoff(%r10)
        .endm

        mac_pre_loop2_2 0,  %xmm0, %xmm6
        mac_pre_loop2_2 16, %xmm1, %xmm7
        .if \nrows>=8
        mac_pre_loop2_2 32, %xmm2, %xmm8
        mac_pre_loop2_2 48, %xmm3, %xmm9
        .if \nrows==12
        mac_pre_loop2_2 64, %xmm4, %xmm10
        mac_pre_loop2_2 80, %xmm5, %xmm11
        .endif
        .endif
        .purgem mac_pre_loop2_2
        # do i=3,nb
        #   h1 = hh(i-1,1)
        #   h2 = hh(i,2)
        #   q(1,i) = q(1,i) + x1*h1 + y1*h2
        #   q(2,i) = q(2,i) + x2*h1 + y2*h2
        # enddo

        addq    $8, %r11
        .align 16
1:
        cmpq    %rax, %r11          # Jump out of the loop if %r11 >= %rax
        jge     2f

        addq    %r8, %r10           # %r10 => q(.,i)

        movddup (%r11), %xmm14      # hh(i-1,1)
        movddup 8(%r11,%r9), %xmm15 # hh(i,2)
        .macro mac_loop2 qoff, X, Y
        movaps  \X, %xmm13
        mulpd   %xmm14, %xmm13
        movaps  \Y, %xmm12
        mulpd   %xmm15, %xmm12
        addpd   %xmm12, %xmm13
        addpd   \qoff(%r10), %xmm13
        movaps  %xmm13, \qoff(%r10)
        .endm

        mac_loop2   0,  %xmm0, %xmm6
        mac_loop2   16, %xmm1, %xmm7
        .if \nrows>=8
        mac_loop2   32, %xmm2, %xmm8
        mac_loop2   48, %xmm3, %xmm9
        .if \nrows==12
        mac_loop2   64, %xmm4, %xmm10
        mac_loop2   80, %xmm5, %xmm11
        .endif
        .endif
        .purgem mac_loop2

        addq    $8, %r11
        jmp     1b
2:
        # q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
        # q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)

        addq    %r8, %r10           # %r10 => q(.,nb+1)
        movddup (%r11), %xmm14

        .macro mac_post_loop2 qoff, X
        movaps  \qoff(%r10), %xmm13 # q(.,nb+1)
        mulpd   %xmm14, \X
        addpd   \X, %xmm13
        movaps  %xmm13, \qoff(%r10)
        .endm

        mac_post_loop2  0,  %xmm0
        mac_post_loop2  16, %xmm1
        .if \nrows>=8
        mac_post_loop2  32, %xmm2
        mac_post_loop2  48, %xmm3
        .if \nrows==12
        mac_post_loop2  64, %xmm4
        mac_post_loop2  80, %xmm5
        .endif
        .endif
        .purgem mac_post_loop2

        .endm
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# FORTRAN Interface:
#
#   subroutine double_hh_trafo(q, hh, nb, nq, ldq, ldh)
#
#   integer, intent(in)   :: nb, nq, ldq, ldh
#   real*8, intent(inout) :: q(ldq,*)
#   real*8, intent(in)    :: hh(ldh,*)
#
# Parameter mapping to registers
#   parameter 1: %rdi : q
#   parameter 2: %rsi : hh
#   parameter 3: %rdx : nb
#   parameter 4: %rcx : nq
#   parameter 5: %r8  : ldq
#   parameter 6: %r9  : ldh
#
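# For illustration, the same entry point can also be declared from C (a
# sketch, assuming the SysV AMD64 ABI and Fortran pass-by-reference
# semantics; the trailing underscore matches the .globl name above):
#
#   extern void double_hh_trafo_(double *q, double *hh, int *nb,
#                                int *nq, int *ldq, int *ldh);
#
# All six arguments arrive as pointers, which is why the integer parameters
# are dereferenced with movslq at the entry point below.
#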
#-------------------------------------------------------------------------------
        .align  16,0x90
double_hh_trafo_:

        # Get integer parameters into corresponding registers
        movslq  (%rdx), %rdx    # nb
        movslq  (%rcx), %rcx    # nq
        movslq  (%r8), %r8      # ldq
        movslq  (%r9), %r9      # ldh
        # Get ldq in bytes
        addq    %r8, %r8
        addq    %r8, %r8
        addq    %r8, %r8        # 8*ldq, i.e. ldq in bytes

        # Get ldh in bytes
        addq    %r9, %r9
        addq    %r9, %r9
        addq    %r9, %r9        # 8*ldh, i.e. ldh in bytes

        # set %rax to the address of hh at the end of the loops,
        # i.e. if %r11 >= %rax we must jump out of the loop.
        # please note: %rax = 8*%rdx + %rsi - 8

        movq    %rdx, %rax
        addq    %rax, %rax
        addq    %rax, %rax
        addq    %rax, %rax
        addq    %rsi, %rax
        subq    $8, %rax
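
        # Worked example of the end address: %r11 starts at hh+8 (= hh(2,1))
        # and advances 8 bytes per iteration, covering hh(i-1,1) for i = 3..nb.
        # The three self-additions of %rax multiply nb by 8, so
        # %rax = hh + 8*nb - 8 is the address of hh(nb,1): the loops stop
        # there, and the post-loop code then consumes hh(nb,1) itself.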
#-----------------------------------------------------------
        # Calculate the dot product of the two Householder vectors

        # decrement stack pointer to make space for s
        subq    $8, %rsp

        # Fortran code:
        # s = hh(2,2)*1
        # do i=3,nb
        #   s = s+hh(i,2)*hh(i-1,1)
        # enddo

        movq    %rsi, %r11      # Copy address of hh

        movsd   8(%r11,%r9), %xmm0  # hh(2,2)
        addq    $8, %r11
1:
        cmpq    %rax, %r11
        jge     2f
        movsd   (%r11), %xmm14      # hh(i-1,1)
        movsd   8(%r11,%r9), %xmm15 # hh(i,2)
        mulsd   %xmm14, %xmm15
        addsd   %xmm15, %xmm0
        addq    $8, %r11
        jmp     1b
2:
        movsd   %xmm0, (%rsp)       # put s on top of stack
#-----------------------------------------------------------
rloop_s:
        cmpq    $8, %rcx        # if %rcx <= 8 jump out of loop
        jle     rloop_e
        hh_trafo_real 12        # transform 12 rows
        addq    $96, %rdi       # increment q start address by 96 bytes (12 rows)
        subq    $12, %rcx       # decrement nq
        jmp     rloop_s
rloop_e:
        cmpq    $4, %rcx        # if %rcx <= 4 jump to test_4
        jle     test_4
        hh_trafo_real 8         # transform 8 rows
        jmp     return1
test_4:
        cmpq    $0, %rcx        # if %rcx <= 0 jump to return1
        jle     return1
        hh_trafo_real 4         # transform 4 rows
return1:
        addq    $8, %rsp        # reset stack pointer
        ret
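
# The row dispatch above, as a C-style sketch (assuming nq is a multiple
# of 4, which the three block sizes imply):
#
#   while (nq > 8) { hh_trafo_real(12); q += 12; nq -= 12; }
#   if      (nq > 4) hh_trafo_real(8);
#   else if (nq > 0) hh_trafo_real(4);
#   return;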
        .align  16,0x90
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
        .macro hh_trafo_complex nrows

        # When this macro is called, the following registers are set and must not be changed
        # %rdi: Address of q
        # %rsi: Address of hh
        # %rdx: nb
        # %rcx: Remaining rows nq
        # %r8:  ldq in bytes
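
        # Packing, as used below: one 16-byte xmm register holds a single
        # complex*16 value (real part in the low half, imaginary part in the
        # high half), so the base case covers 2 rows in %xmm0/%xmm1,
        # nrows >= 4 adds %xmm2/%xmm3, and nrows == 6 adds %xmm4/%xmm5;
        # %xmm6-%xmm11 are the matching y accumulators.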
        movq    %rdi, %r10      # Copy address of q
        movq    %rsi, %r11      # Copy address of hh

        # set %rax to the address of hh at the end of the loops,
        # i.e. if %r11 >= %rax we must jump out of the loop.
        # please note: %rax = 16*%rdx + %rsi

        movq    %rdx, %rax
        addq    %rax, %rax
        addq    %rax, %rax
        addq    %rax, %rax
        addq    %rax, %rax
        addq    %rsi, %rax
        # x1 = q(1,1); y1 = 0
        # x2 = q(2,1); y2 = 0
        # ...

        movaps  (%r10), %xmm0
        movaps  16(%r10), %xmm1
        xorps   %xmm6, %xmm6
        xorps   %xmm7, %xmm7
        .if \nrows>=4
        movaps  32(%r10), %xmm2
        movaps  48(%r10), %xmm3
        xorps   %xmm8, %xmm8
        xorps   %xmm9, %xmm9
        .if \nrows==6
        movaps  64(%r10), %xmm4
        movaps  80(%r10), %xmm5
        xorps   %xmm10, %xmm10
        xorps   %xmm11, %xmm11
        .endif
        .endif
        # do i=2,nb
        #   h1 = conjg(hh(i))
        #   x1 = x1 + q(1,i)*h1
        #   x2 = x2 + q(2,i)*h1
        #   ...
        # enddo

        addq    $16, %r11       # %r11 => hh(2)
        .align 16
1:
        cmpq    %rax, %r11      # Jump out of the loop if %r11 >= %rax
        jge     2f

        addq    %r8, %r10       # %r10 => q(.,i)

        movddup (%r11), %xmm14  # real(hh(i))
        movddup 8(%r11), %xmm15 # imag(hh(i))
        .macro mac_loop1 qoff, X, Y
        movaps  \qoff(%r10), %xmm13 # q(.,i)
        movaps  %xmm13, %xmm12
        mulpd   %xmm14, %xmm13      # q(.,i)*real(hh(i))
        addpd   %xmm13, \X          # x1 = x1 + q(.,i)*real(hh(i))
        mulpd   %xmm15, %xmm12      # q(.,i)*imag(hh(i))
        addsubpd %xmm12, \Y         # y1 = y1 -/+ q(.,i)*imag(hh(i))
        .endm
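
        # Worked arithmetic for the macro above, with q = (qr,qi) and
        # h = (hr,hi) packed low/high:
        #   X += (qr*hr, qi*hr)              after mulpd %xmm14
        #   Y  = (Ylo - qr*hi, Yhi + qi*hi)  after addsubpd %xmm12
        # Once the loop finishes, the halves of Y are swapped and added to X
        # (mac_post_loop_1 below), yielding
        #   (qr*hr + qi*hi, qi*hr - qr*hi) = q*conjg(h)
        # which matches the Fortran pseudocode above.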
        mac_loop1   0,  %xmm0, %xmm6
        mac_loop1   16, %xmm1, %xmm7
        .if \nrows>=4
        mac_loop1   32, %xmm2, %xmm8
        mac_loop1   48, %xmm3, %xmm9
        .if \nrows==6
        mac_loop1   64, %xmm4, %xmm10
        mac_loop1   80, %xmm5, %xmm11
        .endif
        .endif
        .purgem mac_loop1

        addq    $16, %r11       # %r11 => hh(i+1)
        jmp     1b
2:
        # Now the content of the yn has to be swapped and added to xn

        .macro mac_post_loop_1 X, Y
        shufpd  $1, \Y, \Y
        addpd   \Y, \X
        .endm

        mac_post_loop_1 %xmm0, %xmm6
        mac_post_loop_1 %xmm1, %xmm7
        .if \nrows>=4
        mac_post_loop_1 %xmm2, %xmm8
        mac_post_loop_1 %xmm3, %xmm9
        .if \nrows==6
        mac_post_loop_1 %xmm4, %xmm10
        mac_post_loop_1 %xmm5, %xmm11
        .endif
        .endif
        .purgem mac_post_loop_1
        # tau1 = hh(1)
        #
        # h1 = -tau1
        # x1 = x1*h1; y1 = x1 with halves exchanged
        # x2 = x2*h1; y2 = x2 with halves exchanged
        # ...

        movq    %rsi, %r11      # restore address of hh

        xorps   %xmm14, %xmm14
        movddup (%r11), %xmm12  # real(hh(1))
        subpd   %xmm12, %xmm14  # -real(hh(1))
        xorps   %xmm15, %xmm15
        movddup 8(%r11), %xmm12 # imag(hh(1))
        subpd   %xmm12, %xmm15  # -imag(hh(1))
        .macro mac_xform X, Y
        movaps  \X, %xmm12
        shufpd  $1, \X, %xmm12
        mulpd   %xmm15, %xmm12
        mulpd   %xmm14, \X
        addsubpd %xmm12, \X
        movaps  \X, \Y          # copy to y
        shufpd  $1, \X, \Y      # exchange halves
        .endm
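
        # Worked arithmetic for mac_xform, with x = (xr,xi) and the negated
        # tau h1 = -hh(1) = (hr,hi), hr broadcast in %xmm14 and hi in %xmm15:
        #   %xmm12 = (xi*hi, xr*hi)   swapped copy times imaginary part
        #   x      = (xr*hr, xi*hr)   original times real part
        #   addsubpd: x = (xr*hr - xi*hi, xi*hr + xr*hi) = x*h1
        # i.e. a full (non-conjugated) complex multiply; y then receives a
        # copy of x with its halves exchanged, as the comments above state.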
        mac_xform   %xmm0, %xmm6
        mac_xform   %xmm1, %xmm7
        .if \nrows>=4
        mac_xform   %xmm2, %xmm8
        mac_xform   %xmm3, %xmm9
        .if \nrows==6
        mac_xform   %xmm4, %xmm10
        mac_xform   %xmm5, %xmm11
        .endif
        .endif
        .purgem mac_xform