Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
b1fe112f
Commit
b1fe112f
authored
Apr 12, 2016
by
Andreas Marek
Browse files
Merge branch 'master' into ELPA_GPU
parents
5093e483
9ef8709f
Changes
6
Hide whitespace changes
Inline
Side-by-side
Makefile.am
View file @
b1fe112f
...
...
@@ -25,6 +25,7 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.F90 \
src/mod_compute_hh_trafo_real.F90
\
src/mod_compute_hh_trafo_complex.F90
\
src/mod_pack_unpack_complex.F90
\
src/aligned_mem.F90
\
src/elpa2_compute.F90
\
src/elpa2.F90
\
src/elpa_c_interface.F90
\
...
...
src/aligned_mem.F90
0 → 100644
View file @
b1fe112f
!> Explicit Fortran interfaces to the C library routines `posix_memalign`
!> and `free`, so that Fortran code can obtain and release memory with a
!> caller-chosen alignment through `type(c_ptr)` handles.
!>
!> Typical use: call `posix_memalign`, check that the result is zero, map
!> the returned `c_ptr` onto a Fortran pointer array with `c_f_pointer`,
!> and finally release the block with `free` (never with `deallocate`).
!>
!> NOTE(review): no `private` statement is added on purpose. The entities of
!> `iso_c_binding` remain accessible through `use aligned_mem`, and existing
!> callers may rely on that -- confirm before restricting visibility.
module aligned_mem
  use, intrinsic :: iso_c_binding
  implicit none

  interface
    !> Binding to the C function `posix_memalign`.
    !>
    !> memptr    : receives the address of the allocated block (passed by
    !>             reference, i.e. as `void **` on the C side).
    !> alignment : requested alignment in bytes, passed by value. POSIX
    !>             requires a power of two that is a multiple of
    !>             `sizeof(void *)`.
    !> size      : number of bytes to allocate, passed by value.
    !> error     : result; zero on success, a non-zero error code otherwise.
    function posix_memalign(memptr, alignment, size) result(error) &
        bind(C, name="posix_memalign")
      import :: c_int, c_size_t, c_ptr
      ! Interface bodies do not inherit the host's implicit typing rules.
      implicit none
      integer(kind=c_int) :: error
      type(c_ptr), intent(inout) :: memptr
      integer(kind=c_size_t), intent(in), value :: alignment, size
    end function posix_memalign
  end interface

  interface
    !> Binding to the C function `free`.
    !>
    !> ptr : address of the block to release, passed by value.
    subroutine free(ptr) bind(C, name="free")
      import :: c_ptr
      ! Interface bodies do not inherit the host's implicit typing rules.
      implicit none
      type(c_ptr), value :: ptr
    end subroutine free
  end interface

end module aligned_mem
src/elpa2_compute.F90
View file @
b1fe112f
...
...
@@ -73,6 +73,7 @@ module ELPA2_compute
use
elpa_pdgeqrf
use
precision
use
elpa_mpi
use
aligned_mem
implicit
none
...
...
src/elpa2_compute_complex_template.X90
View file @
b1fe112f
...
...
@@ -2995,11 +2995,12 @@
logical :: flag
#ifdef WITH_OPENMP
complex(kind=COMPLEX_DATATYPE),
allocatable
:: a(:,:,:,:)
, row(:)
complex(kind=COMPLEX_DATATYPE),
pointer
:: a(:,:,:,:)
#else
complex(kind=COMPLEX_DATATYPE),
allocatable
:: a(:,:,:)
, row(:)
complex(kind=COMPLEX_DATATYPE),
pointer
:: a(:,:,:)
#endif
type(c_ptr) :: a_ptr
complex(kind=COMPLEX_DATATYPE), allocatable :: row(:)
complex(kind=COMPLEX_DATATYPE), allocatable :: row_group(:,:)
#ifdef WITH_OPENMP
...
...
@@ -3213,24 +3214,29 @@
endif
if (.not.(useGPU)) then
allocate(a(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a(1,1,1,1))) /= 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating a "//errorMessage
stop
endif
call c_f_pointer(a_ptr, a, [stripe_width,a_dim2,stripe_count,max_threads] )
! allocate(a(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
! a(:,:,:,:) should be set to 0 in a parallel region, not here!
endif
#else /* OpenMP */
if (.not.(useGPU)) then
allocate(a(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a(1,1,1))) /= 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating a "//errorMessage
stop
endif
call c_f_pointer(a_ptr, a, [stripe_width,a_dim2,stripe_count] )
! allocate(a(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
a(:,:,:) = 0
endif
...
...
@@ -5368,11 +5374,13 @@
! deallocate all working space
if (.not.(useGPU)) then
deallocate(a, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating a "//errorMessage
stop
endif
nullify(a)
call free(a_ptr)
! deallocate(a, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"trans_ev_tridi_to_band_complex: error deallocating a "//errorMessage
! stop
! endif
endif
deallocate(row, stat=istat, errmsg=errorMessage)
...
...
src/elpa2_compute_real_template.X90
View file @
b1fe112f
...
...
@@ -3410,11 +3410,12 @@
logical :: flag
#ifdef WITH_OPENMP
real(kind=REAL_DATATYPE),
allocatable
:: a(:,:,:,:)
, row(:)
real(kind=REAL_DATATYPE),
pointer
:: a(:,:,:,:)
#else
real(kind=REAL_DATATYPE),
allocatable
:: a(:,:,:)
, row(:)
real(kind=REAL_DATATYPE),
pointer
:: a(:,:,:)
#endif
type(c_ptr) :: a_ptr
real(kind=REAL_DATATYPE) , allocatable :: row(:)
real(kind=REAL_DATATYPE) , allocatable :: row_group(:,:)
#ifdef WITH_OPENMP
...
...
@@ -3592,21 +3593,29 @@
endif
else ! GPUs are not used
#if 0
!DEC$ ATTRIBUTES ALIGN: 64:: a
#endif
#ifdef WITH_OPENMP
allocate(a(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a(1,1,1,1))) /= 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating a"//errorMessage
stop
endif
call c_f_pointer(a_ptr, a, [stripe_width,a_dim2,stripe_count,max_threads])
! allocate(a(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
! a(:,:,:,:) should be set to 0 in a parallel region, not here!
#else
allocate(a(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a(1,1,1))) /= 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating a"//errorMessage
stop
endif
call c_f_pointer(a_ptr, a,[stripe_width,a_dim2,stripe_count] )
!allocate(a(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
#ifdef DOUBLE_PRECISION_REAL
a(:,:,:) = 0._rk8
#else
...
...
@@ -5563,11 +5572,13 @@
! deallocate all working space
if (.not.(useGPU)) then
deallocate(a, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating a "//errorMessage
stop
endif
nullify(a)
call free(a_ptr)
! deallocate(a, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"trans_ev_tridi_to_band_real: error when deallocating a "//errorMessage
! stop
! endif
endif
deallocate(row, stat=istat, errmsg=errorMessage)
...
...
src/mod_compute_hh_trafo_real.F90
View file @
b1fe112f
...
...
@@ -115,11 +115,11 @@ module compute_hh_trafo_real
#ifndef WITH_OPENMP
integer
(
kind
=
ik
),
intent
(
in
)
::
last_stripe_width
! real(kind=rk8) :: a(stripe_width,a_dim2,stripe_count)
real
(
kind
=
rk8
),
allocatable
::
a
(:,:,:)
real
(
kind
=
rk8
),
pointer
::
a
(:,:,:)
#else
integer
(
kind
=
ik
),
intent
(
in
)
::
max_threads
,
l_nev
,
thread_width
! real(kind=rk8) :: a(stripe_width,a_dim2,stripe_count,max_threads)
real
(
kind
=
rk8
),
allocatable
::
a
(:,:,:,:)
real
(
kind
=
rk8
),
pointer
::
a
(:,:,:,:)
#endif
integer
(
kind
=
ik
),
intent
(
in
)
::
THIS_REAL_ELPA_KERNEL
...
...
@@ -377,7 +377,7 @@ module compute_hh_trafo_real
!#if defined(WITH_AVX_SANDYBRIDGE)
! call double_hh_trafo_real_
avx
_avx
2
_2hv(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
! call double_hh_trafo_real_
sse
_avx_2hv(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
!#endif
#ifdef WITH_OPENMP
...
...
@@ -407,10 +407,10 @@ module compute_hh_trafo_real
w
(:,
3
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-2
)
w
(:,
4
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-3
)
#ifdef WITH_OPENMP
call
quad_hh_trafo_real_
sse
_avx_4hv
_double
(
a
(
1
,
j
+
off
+
a_off
-3
,
istripe
,
my_thread
),
w
,
&
call
quad_hh_trafo_real_
avx
_avx
2
_4hv
(
a
(
1
,
j
+
off
+
a_off
-3
,
istripe
,
my_thread
),
w
,
&
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
quad_hh_trafo_real_
sse
_avx_4hv
_double
(
a
(
1
,
j
+
off
+
a_off
-3
,
istripe
),
w
,
&
call
quad_hh_trafo_real_
avx
_avx
2
_4hv
(
a
(
1
,
j
+
off
+
a_off
-3
,
istripe
),
w
,
&
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
...
...
@@ -418,10 +418,10 @@ module compute_hh_trafo_real
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
-1
)
#ifdef WITH_OPENMP
call
double_hh_trafo_real_
sse
_avx_2hv
_double
(
a
(
1
,
jj
+
off
+
a_off
-1
,
istripe
,
my_thread
),
&
call
double_hh_trafo_real_
avx
_avx
2
_2hv
(
a
(
1
,
jj
+
off
+
a_off
-1
,
istripe
,
my_thread
),
&
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_real_
sse
_avx_2hv
_double
(
a
(
1
,
jj
+
off
+
a_off
-1
,
istripe
),
&
call
double_hh_trafo_real_
avx
_avx
2
_2hv
(
a
(
1
,
jj
+
off
+
a_off
-1
,
istripe
),
&
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
...
...
@@ -451,10 +451,10 @@ module compute_hh_trafo_real
w
(:,
5
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-4
)
w
(:,
6
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-5
)
#ifdef WITH_OPENMP
call
hexa_hh_trafo_real_
sse
_avx_6hv
_double
(
a
(
1
,
j
+
off
+
a_off
-5
,
istripe
,
my_thread
),
w
,
&
call
hexa_hh_trafo_real_
avx
_avx
2
_6hv
(
a
(
1
,
j
+
off
+
a_off
-5
,
istripe
,
my_thread
),
w
,
&
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
hexa_hh_trafo_real_
sse
_avx_6hv
_double
(
a
(
1
,
j
+
off
+
a_off
-5
,
istripe
),
w
,
&
call
hexa_hh_trafo_real_
avx
_avx
2
_6hv
(
a
(
1
,
j
+
off
+
a_off
-5
,
istripe
),
w
,
&
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
...
...
@@ -464,10 +464,10 @@ module compute_hh_trafo_real
w
(:,
3
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
-2
)
w
(:,
4
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
-3
)
#ifdef WITH_OPENMP
call
quad_hh_trafo_real_
sse
_avx_4hv
_double
(
a
(
1
,
jj
+
off
+
a_off
-3
,
istripe
,
my_thread
),
w
,
&
call
quad_hh_trafo_real_
avx
_avx
2
_4hv
(
a
(
1
,
jj
+
off
+
a_off
-3
,
istripe
,
my_thread
),
w
,
&
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
quad_hh_trafo_real_
sse
_avx_4hv
_double
(
a
(
1
,
jj
+
off
+
a_off
-3
,
istripe
),
w
,
&
call
quad_hh_trafo_real_
avx
_avx
2
_4hv
(
a
(
1
,
jj
+
off
+
a_off
-3
,
istripe
),
w
,
&
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
...
...
@@ -475,10 +475,10 @@ module compute_hh_trafo_real
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
jjj
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
jjj
+
off
-1
)
#ifdef WITH_OPENMP
call
double_hh_trafo_real_
sse
_avx_2hv
_double
(
a
(
1
,
jjj
+
off
+
a_off
-1
,
istripe
,
my_thread
),
&
call
double_hh_trafo_real_
avx
_avx
2
_2hv
(
a
(
1
,
jjj
+
off
+
a_off
-1
,
istripe
,
my_thread
),
&
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_real_
sse
_avx_2hv
_double
(
a
(
1
,
jjj
+
off
+
a_off
-1
,
istripe
),
&
call
double_hh_trafo_real_
avx
_avx
2
_2hv
(
a
(
1
,
jjj
+
off
+
a_off
-1
,
istripe
),
&
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
...
...
@@ -754,10 +754,10 @@ module compute_hh_trafo_real
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
#ifdef WITH_OPENMP
call
double_hh_trafo_real_
sse
_avx_2hv_single
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
),
&
call
double_hh_trafo_real_
avx
_avx
2
_2hv_single
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
),
&
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_real_
sse
_avx_2hv_single
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
),
&
call
double_hh_trafo_real_
avx
_avx
2
_2hv_single
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
),
&
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
...
...
@@ -1000,10 +1000,10 @@ module compute_hh_trafo_real
w
(:,
4
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
-3
)
#ifdef WITH_OPENMP
call
quad_hh_trafo_real_
avx
_avx
2
_4hv
(
a
(
1
,
jj
+
off
+
a_off
-3
,
istripe
,
my_thread
),
w
,
&
call
quad_hh_trafo_real_
sse
_avx_4hv
(
a
(
1
,
jj
+
off
+
a_off
-3
,
istripe
,
my_thread
),
w
,
&
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
quad_hh_trafo_real_
avx
_avx
2
_4hv
(
a
(
1
,
jj
+
off
+
a_off
-3
,
istripe
),
w
,
&
call
quad_hh_trafo_real_
sse
_avx_4hv
(
a
(
1
,
jj
+
off
+
a_off
-3
,
istripe
),
w
,
&
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment