Commit 59e405e0 authored by Lorenz Hüdepohl's avatar Lorenz Hüdepohl

AVX kernels need aligned memory

For the Intel compiler, this was assured with the pragma

  !DEC$ ATTRIBUTES ALIGN: 64:: a

however, other compilers such as gcc of course did not honour this,
which could result in SIGSEGVs in case the variable was not aligned to
32 bytes (by chance!).

This fixes issue #11, thanks to Nico Holmberg for reporting this.
parent a37d4d3a
...@@ -21,6 +21,7 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \ ...@@ -21,6 +21,7 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
src/mod_compute_hh_trafo_real.F90 \ src/mod_compute_hh_trafo_real.F90 \
src/mod_compute_hh_trafo_complex.F90 \ src/mod_compute_hh_trafo_complex.F90 \
src/mod_pack_unpack_complex.F90 \ src/mod_pack_unpack_complex.F90 \
src/aligned_mem.F90 \
src/elpa2_compute.F90 \ src/elpa2_compute.F90 \
src/elpa2.F90 \ src/elpa2.F90 \
src/elpa_c_interface.F90 \ src/elpa_c_interface.F90 \
......
module aligned_mem
use, intrinsic :: iso_c_binding
interface
function posix_memalign(memptr, alignment, size) result(error) bind(C, name="posix_memalign")
import c_int, c_size_t, c_ptr
integer(kind=c_int) :: error
type(c_ptr), intent(inout) :: memptr
integer(kind=c_size_t), intent(in), value :: alignment, size
end function
end interface
interface
subroutine free(ptr) bind(C, name="free")
import c_ptr
type(c_ptr), value :: ptr
end subroutine
end interface
end module
...@@ -72,6 +72,7 @@ module ELPA2_compute ...@@ -72,6 +72,7 @@ module ELPA2_compute
use elpa2_utilities use elpa2_utilities
use elpa_pdgeqrf use elpa_pdgeqrf
use elpa_mpi use elpa_mpi
use aligned_mem
implicit none implicit none
...@@ -93,6 +94,7 @@ module ELPA2_compute ...@@ -93,6 +94,7 @@ module ELPA2_compute
integer, public :: which_qr_decomposition = 1 ! defines, which QR-decomposition algorithm will be used integer, public :: which_qr_decomposition = 1 ! defines, which QR-decomposition algorithm will be used
! 0 for unblocked ! 0 for unblocked
! 1 for blocked (maxrank: nblk) ! 1 for blocked (maxrank: nblk)
contains contains
subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, & subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
...@@ -1776,10 +1778,12 @@ module ELPA2_compute ...@@ -1776,10 +1778,12 @@ module ELPA2_compute
logical :: flag logical :: flag
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
real(kind=rk), allocatable :: a(:,:,:,:), row(:) real(kind=rk), pointer :: a(:,:,:,:)
#else #else
real(kind=rk), allocatable :: a(:,:,:), row(:) real(kind=rk), pointer :: a(:,:,:)
#endif #endif
type(c_ptr) :: a_ptr
real(kind=rk), allocatable :: row(:)
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
real(kind=rk), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:) real(kind=rk), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
...@@ -1891,13 +1895,28 @@ module ELPA2_compute ...@@ -1891,13 +1895,28 @@ module ELPA2_compute
a_dim2 = max_blk_size + nbw a_dim2 = max_blk_size + nbw
!DEC$ ATTRIBUTES ALIGN: 64:: a
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
allocate(a(stripe_width,a_dim2,stripe_count,max_threads)) if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a(1,1,1,1))) /= 0) then
! a(:,:,:,:) should be set to 0 in a parallel region, not here!
#else #else
allocate(a(stripe_width,a_dim2,stripe_count)) if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a(1,1,1))) /= 0) then
#endif
write(error_unit,*) "Cannot allocate memory"
success = .false.
return
endif
call c_f_pointer(a_ptr, a, &
#ifdef WITH_OPENMP
[stripe_width,a_dim2,stripe_count,max_threads] &
#else
[stripe_width,a_dim2,stripe_count] &
#endif
)
#ifndef WITH_OPENMP
a(:,:,:) = 0 a(:,:,:) = 0
#else
! a(:,:,:,:) should be set to 0 in a parallel region, not here!
#endif #endif
allocate(row(l_nev)) allocate(row(l_nev))
...@@ -2766,7 +2785,8 @@ top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & ...@@ -2766,7 +2785,8 @@ top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, &
! deallocate all working space ! deallocate all working space
deallocate(a) nullify(a)
call free(a_ptr)
deallocate(row) deallocate(row)
deallocate(limits) deallocate(limits)
deallocate(result_send_request) deallocate(result_send_request)
...@@ -4182,10 +4202,13 @@ top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & ...@@ -4182,10 +4202,13 @@ top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, &
logical :: flag logical :: flag
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
complex(kind=ck), allocatable :: a(:,:,:,:), row(:) complex(kind=ck), pointer :: a(:,:,:,:)
#else #else
complex(kind=ck), allocatable :: a(:,:,:), row(:) complex(kind=ck), pointer :: a(:,:,:)
#endif #endif
complex(kind=ck), allocatable :: row(:)
type(c_ptr) :: a_ptr
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
complex(kind=ck), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:) complex(kind=ck), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
complex(kind=ck), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:) complex(kind=ck), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
...@@ -4305,12 +4328,25 @@ top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & ...@@ -4305,12 +4328,25 @@ top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, &
a_dim2 = max_blk_size + nbw a_dim2 = max_blk_size + nbw
!DEC$ ATTRIBUTES ALIGN: 64:: a
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
allocate(a(stripe_width,a_dim2,stripe_count,max_threads)) if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a(1,1,1,1))) /= 0) then
! a(:,:,:,:) should be set to 0 in a parallel region, not here!
#else #else
allocate(a(stripe_width,a_dim2,stripe_count)) if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a(1,1,1))) /= 0) then
#endif
write(error_unit,*) "Cannot allocate memory"
success = .false.
return
endif
call c_f_pointer(a_ptr, a, &
#ifdef WITH_OPENMP
[stripe_width,a_dim2,stripe_count,max_threads] &
#else
[stripe_width,a_dim2,stripe_count] &
#endif
)
#ifndef WITH_OPENMP
a(:,:,:) = 0 a(:,:,:) = 0
#endif #endif
...@@ -4339,8 +4375,8 @@ top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & ...@@ -4339,8 +4375,8 @@ top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, &
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel") call timer%stop("OpenMP parallel")
#endif #endif
#endif #endif
do ip = np_rows-1, 0, -1 do ip = np_rows-1, 0, -1
if (my_prow == ip) then if (my_prow == ip) then
! Receive my rows which have not yet been received ! Receive my rows which have not yet been received
...@@ -5240,7 +5276,8 @@ top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & ...@@ -5240,7 +5276,8 @@ top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, &
! deallocate all working space ! deallocate all working space
deallocate(a) nullify(a)
call free(a_ptr)
deallocate(row) deallocate(row)
deallocate(limits) deallocate(limits)
deallocate(result_send_request) deallocate(result_send_request)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment