Commit b1fe112f authored by Andreas Marek
Browse files

Merge branch 'master' into ELPA_GPU

parents 5093e483 9ef8709f
......@@ -25,6 +25,7 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.F90 \
src/mod_compute_hh_trafo_real.F90 \
src/mod_compute_hh_trafo_complex.F90 \
src/mod_pack_unpack_complex.F90 \
src/aligned_mem.F90 \
src/elpa2_compute.F90 \
src/elpa2.F90 \
src/elpa_c_interface.F90 \
......
!> C-interoperability bindings for aligned heap allocation.
!>
!> Provides explicit interfaces to the C library routines `posix_memalign`
!> and `free`, so that Fortran code can obtain memory with a caller-chosen
!> alignment, map it onto a Fortran pointer with `c_f_pointer`, and release
!> it again with `free`.
!>
!> Memory obtained through `posix_memalign` must be released with `free`,
!> never with the Fortran `deallocate` statement.
module aligned_mem
  ! Deliberately used without an only-list and left public: the entities of
  ! iso_c_binding stay accessible to program units that use this module,
  ! exactly as before.
  use, intrinsic :: iso_c_binding
  implicit none

  interface
    !> Allocate `size` bytes whose address is a multiple of `alignment`.
    !>
    !> @param memptr     receives the address of the allocated memory on
    !>                   success (corresponds to the C `void **` argument,
    !>                   hence passed by reference)
    !> @param alignment  required alignment in bytes; per POSIX it must be a
    !>                   power of two and a multiple of `sizeof(void *)`
    !> @param size       number of bytes to allocate
    !> @return error     0 on success, otherwise a C error number
    !>                   (the routine does not set `errno`)
    function posix_memalign(memptr, alignment, size) result(error) bind(C, name="posix_memalign")
      import :: c_int, c_size_t, c_ptr
      implicit none
      integer(kind=c_int) :: error
      type(c_ptr), intent(inout) :: memptr
      integer(kind=c_size_t), intent(in), value :: alignment, size
    end function posix_memalign
  end interface

  interface
    !> Release memory previously obtained from the C allocator
    !> (e.g. through `posix_memalign`).
    !>
    !> @param ptr  address to release, passed by value as in C
    subroutine free(ptr) bind(C, name="free")
      import :: c_ptr
      implicit none
      type(c_ptr), intent(in), value :: ptr
    end subroutine free
  end interface
end module aligned_mem
......@@ -73,6 +73,7 @@ module ELPA2_compute
use elpa_pdgeqrf
use precision
use elpa_mpi
use aligned_mem
implicit none
......
......@@ -2995,11 +2995,12 @@
logical :: flag
#ifdef WITH_OPENMP
complex(kind=COMPLEX_DATATYPE), allocatable :: a(:,:,:,:), row(:)
complex(kind=COMPLEX_DATATYPE), pointer :: a(:,:,:,:)
#else
complex(kind=COMPLEX_DATATYPE), allocatable :: a(:,:,:), row(:)
complex(kind=COMPLEX_DATATYPE), pointer :: a(:,:,:)
#endif
type(c_ptr) :: a_ptr
complex(kind=COMPLEX_DATATYPE), allocatable :: row(:)
complex(kind=COMPLEX_DATATYPE), allocatable :: row_group(:,:)
#ifdef WITH_OPENMP
......@@ -3213,24 +3214,29 @@
endif
if (.not.(useGPU)) then
allocate(a(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a(1,1,1,1))) /= 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating a "//errorMessage
stop
endif
call c_f_pointer(a_ptr, a, [stripe_width,a_dim2,stripe_count,max_threads] )
! allocate(a(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
! a(:,:,:,:) should be set to 0 in a parallel region, not here!
endif
#else /* OpenMP */
if (.not.(useGPU)) then
allocate(a(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a(1,1,1))) /= 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating a "//errorMessage
stop
endif
call c_f_pointer(a_ptr, a, [stripe_width,a_dim2,stripe_count] )
! allocate(a(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
a(:,:,:) = 0
endif
......@@ -5368,11 +5374,13 @@
! deallocate all working space
if (.not.(useGPU)) then
deallocate(a, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating a "//errorMessage
stop
endif
nullify(a)
call free(a_ptr)
! deallocate(a, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"trans_ev_tridi_to_band_complex: error deallocating a "//errorMessage
! stop
! endif
endif
deallocate(row, stat=istat, errmsg=errorMessage)
......
......@@ -3410,11 +3410,12 @@
logical :: flag
#ifdef WITH_OPENMP
real(kind=REAL_DATATYPE), allocatable :: a(:,:,:,:), row(:)
real(kind=REAL_DATATYPE), pointer :: a(:,:,:,:)
#else
real(kind=REAL_DATATYPE), allocatable :: a(:,:,:), row(:)
real(kind=REAL_DATATYPE), pointer :: a(:,:,:)
#endif
type(c_ptr) :: a_ptr
real(kind=REAL_DATATYPE) , allocatable :: row(:)
real(kind=REAL_DATATYPE) , allocatable :: row_group(:,:)
#ifdef WITH_OPENMP
......@@ -3592,21 +3593,29 @@
endif
else ! GPUs are not used
#if 0
!DEC$ ATTRIBUTES ALIGN: 64:: a
#endif
#ifdef WITH_OPENMP
allocate(a(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a(1,1,1,1))) /= 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating a"//errorMessage
stop
endif
call c_f_pointer(a_ptr, a, [stripe_width,a_dim2,stripe_count,max_threads])
! allocate(a(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
! a(:,:,:,:) should be set to 0 in a parallel region, not here!
#else
allocate(a(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a(1,1,1))) /= 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating a"//errorMessage
stop
endif
call c_f_pointer(a_ptr, a,[stripe_width,a_dim2,stripe_count] )
!allocate(a(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
#ifdef DOUBLE_PRECISION_REAL
a(:,:,:) = 0._rk8
#else
......@@ -5563,11 +5572,13 @@
! deallocate all working space
if (.not.(useGPU)) then
deallocate(a, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating a "//errorMessage
stop
endif
nullify(a)
call free(a_ptr)
! deallocate(a, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"trans_ev_tridi_to_band_real: error when deallocating a "//errorMessage
! stop
! endif
endif
deallocate(row, stat=istat, errmsg=errorMessage)
......
......@@ -115,11 +115,11 @@ module compute_hh_trafo_real
#ifndef WITH_OPENMP
integer(kind=ik), intent(in) :: last_stripe_width
! real(kind=rk8) :: a(stripe_width,a_dim2,stripe_count)
real(kind=rk8), allocatable :: a(:,:,:)
real(kind=rk8), pointer :: a(:,:,:)
#else
integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width
! real(kind=rk8) :: a(stripe_width,a_dim2,stripe_count,max_threads)
real(kind=rk8), allocatable :: a(:,:,:,:)
real(kind=rk8), pointer :: a(:,:,:,:)
#endif
integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL
......@@ -377,7 +377,7 @@ module compute_hh_trafo_real
!#if defined(WITH_AVX_SANDYBRIDGE)
! call double_hh_trafo_real_avx_avx2_2hv(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
! call double_hh_trafo_real_sse_avx_2hv(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
!#endif
#ifdef WITH_OPENMP
......@@ -407,10 +407,10 @@ module compute_hh_trafo_real
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
#ifdef WITH_OPENMP
call quad_hh_trafo_real_sse_avx_4hv_double(a(1,j+off+a_off-3,istripe,my_thread), w, &
call quad_hh_trafo_real_avx_avx2_4hv(a(1,j+off+a_off-3,istripe,my_thread), w, &
nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_real_sse_avx_4hv_double(a(1,j+off+a_off-3,istripe), w, &
call quad_hh_trafo_real_avx_avx2_4hv(a(1,j+off+a_off-3,istripe), w, &
nbw, nl, stripe_width, nbw)
#endif
enddo
......@@ -418,10 +418,10 @@ module compute_hh_trafo_real
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_real_sse_avx_2hv_double(a(1,jj+off+a_off-1,istripe,my_thread), &
call double_hh_trafo_real_avx_avx2_2hv(a(1,jj+off+a_off-1,istripe,my_thread), &
w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_real_sse_avx_2hv_double(a(1,jj+off+a_off-1,istripe), &
call double_hh_trafo_real_avx_avx2_2hv(a(1,jj+off+a_off-1,istripe), &
w, nbw, nl, stripe_width, nbw)
#endif
enddo
......@@ -451,10 +451,10 @@ module compute_hh_trafo_real
w(:,5) = bcast_buffer(1:nbw,j+off-4)
w(:,6) = bcast_buffer(1:nbw,j+off-5)
#ifdef WITH_OPENMP
call hexa_hh_trafo_real_sse_avx_6hv_double(a(1,j+off+a_off-5,istripe,my_thread), w, &
call hexa_hh_trafo_real_avx_avx2_6hv(a(1,j+off+a_off-5,istripe,my_thread), w, &
nbw, nl, stripe_width, nbw)
#else
call hexa_hh_trafo_real_sse_avx_6hv_double(a(1,j+off+a_off-5,istripe), w, &
call hexa_hh_trafo_real_avx_avx2_6hv(a(1,j+off+a_off-5,istripe), w, &
nbw, nl, stripe_width, nbw)
#endif
enddo
......@@ -464,10 +464,10 @@ module compute_hh_trafo_real
w(:,3) = bcast_buffer(1:nbw,jj+off-2)
w(:,4) = bcast_buffer(1:nbw,jj+off-3)
#ifdef WITH_OPENMP
call quad_hh_trafo_real_sse_avx_4hv_double(a(1,jj+off+a_off-3,istripe,my_thread), w, &
call quad_hh_trafo_real_avx_avx2_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, &
nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_real_sse_avx_4hv_double(a(1,jj+off+a_off-3,istripe), w, &
call quad_hh_trafo_real_avx_avx2_4hv(a(1,jj+off+a_off-3,istripe), w, &
nbw, nl, stripe_width, nbw)
#endif
enddo
......@@ -475,10 +475,10 @@ module compute_hh_trafo_real
w(:,1) = bcast_buffer(1:nbw,jjj+off)
w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_real_sse_avx_2hv_double(a(1,jjj+off+a_off-1,istripe,my_thread), &
call double_hh_trafo_real_avx_avx2_2hv(a(1,jjj+off+a_off-1,istripe,my_thread), &
w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_real_sse_avx_2hv_double(a(1,jjj+off+a_off-1,istripe), &
call double_hh_trafo_real_avx_avx2_2hv(a(1,jjj+off+a_off-1,istripe), &
w, nbw, nl, stripe_width, nbw)
#endif
enddo
......@@ -754,10 +754,10 @@ module compute_hh_trafo_real
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_real_sse_avx_2hv_single(a(1,j+off+a_off-1,istripe,my_thread), &
call double_hh_trafo_real_avx_avx2_2hv_single(a(1,j+off+a_off-1,istripe,my_thread), &
w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_real_sse_avx_2hv_single(a(1,j+off+a_off-1,istripe), &
call double_hh_trafo_real_avx_avx2_2hv_single(a(1,j+off+a_off-1,istripe), &
w, nbw, nl, stripe_width, nbw)
#endif
enddo
......@@ -1000,10 +1000,10 @@ module compute_hh_trafo_real
w(:,4) = bcast_buffer(1:nbw,jj+off-3)
#ifdef WITH_OPENMP
call quad_hh_trafo_real_avx_avx2_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, &
call quad_hh_trafo_real_sse_avx_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, &
nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_real_avx_avx2_4hv(a(1,jj+off+a_off-3,istripe), w, &
call quad_hh_trafo_real_sse_avx_4hv(a(1,jj+off+a_off-3,istripe), w, &
nbw, nl, stripe_width, nbw)
#endif
enddo
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment