Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
elpa
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
11
Issues
11
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Environments
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
elpa
elpa
Commits
6205cf76
Commit
6205cf76
authored
Apr 19, 2017
by
Pavel Kus
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
changed vector to Vector in comments to fix problem in gcc preprocessor
on misnky
parent
2933c757
Changes
31
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
31 changed files
with
223 additions
and
207 deletions
+223
-207
src/GPU/cuUtils_template.Xcu
src/GPU/cuUtils_template.Xcu
+1
-1
src/elpa1/elpa1_tools_template.X90
src/elpa1/elpa1_tools_template.X90
+1
-1
src/elpa1/elpa1_trans_ev_template.X90
src/elpa1/elpa1_trans_ev_template.X90
+3
-3
src/elpa1/elpa1_tridiag_template.X90
src/elpa1/elpa1_tridiag_template.X90
+14
-14
src/elpa2/elpa2_bandred_template.X90
src/elpa2/elpa2_bandred_template.X90
+6
-6
src/elpa2/elpa2_compute_real_template.X90
src/elpa2/elpa2_compute_real_template.X90
+2
-2
src/elpa2/elpa2_trans_ev_band_to_full_template.X90
src/elpa2/elpa2_trans_ev_band_to_full_template.X90
+3
-3
src/elpa2/elpa2_trans_ev_tridi_to_band_template.X90
src/elpa2/elpa2_trans_ev_tridi_to_band_template.X90
+1
-1
src/elpa2/elpa2_tridiag_band_template.X90
src/elpa2/elpa2_tridiag_band_template.X90
+11
-11
src/elpa2/kernels/elpa2_kernels_real_avx-avx2_2hv_double_precision.c
...ernels/elpa2_kernels_real_avx-avx2_2hv_double_precision.c
+4
-4
src/elpa2/kernels/elpa2_kernels_real_avx-avx2_2hv_single_precision.c
...ernels/elpa2_kernels_real_avx-avx2_2hv_single_precision.c
+5
-5
src/elpa2/kernels/elpa2_kernels_real_avx-avx2_4hv_double_precision.c
...ernels/elpa2_kernels_real_avx-avx2_4hv_double_precision.c
+4
-4
src/elpa2/kernels/elpa2_kernels_real_avx-avx2_4hv_single_precision.c
...ernels/elpa2_kernels_real_avx-avx2_4hv_single_precision.c
+5
-5
src/elpa2/kernels/elpa2_kernels_real_avx-avx2_6hv_double_precision.c
...ernels/elpa2_kernels_real_avx-avx2_6hv_double_precision.c
+4
-4
src/elpa2/kernels/elpa2_kernels_real_avx-avx2_6hv_single_precision.c
...ernels/elpa2_kernels_real_avx-avx2_6hv_single_precision.c
+4
-4
src/elpa2/kernels/elpa2_kernels_real_avx512_2hv_double_precision.c
.../kernels/elpa2_kernels_real_avx512_2hv_double_precision.c
+5
-5
src/elpa2/kernels/elpa2_kernels_real_avx512_2hv_single_precision.c
.../kernels/elpa2_kernels_real_avx512_2hv_single_precision.c
+4
-4
src/elpa2/kernels/elpa2_kernels_real_avx512_4hv_double_precision.c
.../kernels/elpa2_kernels_real_avx512_4hv_double_precision.c
+6
-6
src/elpa2/kernels/elpa2_kernels_real_avx512_4hv_single_precision.c
.../kernels/elpa2_kernels_real_avx512_4hv_single_precision.c
+4
-4
src/elpa2/kernels/elpa2_kernels_real_avx512_6hv_double_precision.c
.../kernels/elpa2_kernels_real_avx512_6hv_double_precision.c
+6
-6
src/elpa2/kernels/elpa2_kernels_real_avx512_6hv_single_precision.c
.../kernels/elpa2_kernels_real_avx512_6hv_single_precision.c
+3
-3
src/elpa2/kernels/elpa2_kernels_real_sse_2hv_double_precision.c
...pa2/kernels/elpa2_kernels_real_sse_2hv_double_precision.c
+3
-3
src/elpa2/kernels/elpa2_kernels_real_sse_2hv_single_precision.c
...pa2/kernels/elpa2_kernels_real_sse_2hv_single_precision.c
+3
-3
src/elpa2/kernels/elpa2_kernels_real_sse_4hv_double_precision.c
...pa2/kernels/elpa2_kernels_real_sse_4hv_double_precision.c
+5
-5
src/elpa2/kernels/elpa2_kernels_real_sse_4hv_single_precision.c
...pa2/kernels/elpa2_kernels_real_sse_4hv_single_precision.c
+6
-6
src/elpa2/kernels/elpa2_kernels_real_sse_6hv_double_precision.c
...pa2/kernels/elpa2_kernels_real_sse_6hv_double_precision.c
+4
-4
src/elpa2/kernels/elpa2_kernels_real_sse_6hv_single_precision.c
...pa2/kernels/elpa2_kernels_real_sse_6hv_single_precision.c
+4
-4
src/elpa2/legacy_interface/elpa_2stage_c_interface_legacy.F90
...elpa2/legacy_interface/elpa_2stage_c_interface_legacy.F90
+16
-8
src/elpa2/qr/elpa_pdgeqrf.F90
src/elpa2/qr/elpa_pdgeqrf.F90
+64
-64
src/elpa2/qr/elpa_pdlarfb.F90
src/elpa2/qr/elpa_pdlarfb.F90
+6
-6
src/elpa_driver/legacy_interface/elpa_driver_c_interface_legacy.F90
...river/legacy_interface/elpa_driver_c_interface_legacy.F90
+16
-8
No files found.
src/GPU/cuUtils_template.Xcu
View file @
6205cf76
...
...
@@ -358,7 +358,7 @@ __global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuF
#endif
int t_idx, v_idx;
// The
v
ector index (v_idx) identifies the pair of HH reflectors from which the dot product is computed
// The
V
ector index (v_idx) identifies the pair of HH reflectors from which the dot product is computed
v_idx = blockIdx.x ;
// The thread index indicates the position within the two HH reflectors
...
...
src/elpa1/elpa1_tools_template.X90
View file @
6205cf76
...
...
@@ -137,7 +137,7 @@
! order, D(I) < D(J) for I < J.
!
! Z (input) DOUBLE PRECISION array, dimension (N)
! The components of the updating
v
ector.
! The components of the updating
V
ector.
!
! DELTA (output) DOUBLE PRECISION array, dimension (N)
! DELTA contains (D(j) - lambda_I) in its j-th component.
...
...
src/elpa1/elpa1_trans_ev_template.X90
View file @
6205cf76
...
...
@@ -275,8 +275,8 @@
nb = 0
do ic = ics, ice
l_colh = local_index(ic , my_pcol, np_cols, nblk, -1) ! Column of Householder
v
ector
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder
v
ector
l_colh = local_index(ic , my_pcol, np_cols, nblk, -1) ! Column of Householder
V
ector
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder
V
ector
if (my_pcol == cur_pcol) then
...
...
@@ -306,7 +306,7 @@
nb = 0
do ic = ics, ice
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder
v
ector
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder
V
ector
hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows)
if (useGPU) then
hvm_ubnd = l_rows
...
...
src/elpa1/elpa1_tridiag_template.X90
View file @
6205cf76
...
...
@@ -174,8 +174,8 @@
#if REALCASE == 1
real(kind=REAL_DATATYPE), allocatable :: tmp(:), &
v_row(:), & ! used to store calculated Householder
v
ector
v_col(:), & ! the same
v
ector, but transposed - differently distributed among MPI tasks
v_row(:), & ! used to store calculated Householder
V
ector
v_col(:), & ! the same
V
ector, but transposed - differently distributed among MPI tasks
u_row(:), &
u_col(:)
#endif
...
...
@@ -183,8 +183,8 @@
complex(kind=COMPLEX_DATATYPE), allocatable :: tmp(:), v_row(:), v_col(:), u_row(:), u_col(:)
#endif
! the following two matrices store pairs of vectors v and u calculated in each step
! at most max_stored_uv
v
ector pairs are stored, than the matrix A_i is explicitli updated
! u and v are stored both in row and
v
ector forms
! at most max_stored_uv
V
ector pairs are stored, than the matrix A_i is explicitli updated
! u and v are stored both in row and
V
ector forms
! pattern: v1,u1,v2,u2,v3,u3,....
! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa
#if REALCASE == 1
...
...
@@ -266,7 +266,7 @@
! allocate memmory for vectors
! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa
! todo: if something has length max_local_rows, it is actually a column, no?
! todo: probably one should read it as v_row =
v
ector v distributed among rows
! todo: probably one should read it as v_row =
V
ector v distributed among rows
!
allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
...
...
@@ -359,7 +359,7 @@
endif
! main cycle of tridiagonalization
! in each step, 1 Householder
v
ector is calculated
! in each step, 1 Householder
V
ector is calculated
do istep = na, 3 ,-1
! Calculate number of local rows and columns of the still remaining matrix
...
...
@@ -367,12 +367,12 @@
l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1)
l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1)
! Calculate
v
ector for Householder transformation on all procs
! Calculate
V
ector for Householder transformation on all procs
! owning column istep
if (my_pcol == pcol(istep, nblk, np_cols)) then
! Get
v
ector to be transformed; distribute last element and norm of
! Get
V
ector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column
! copy l_cols + 1 column of A to v_row
...
...
@@ -434,7 +434,7 @@
#endif
&PRECISION &
(vrl, vnorm2, xf, tau(istep))
! Scale v_row and store Householder
v
ector for back transformation
! Scale v_row and store Householder
V
ector for back transformation
v_row(1:l_rows) = v_row(1:l_rows) * xf
if (my_prow == prow(istep-1, nblk, np_rows)) then
...
...
@@ -444,7 +444,7 @@
e_vec(istep-1) = vrl
endif
! store Householder
v
ector for back transformation
! store Householder
V
ector for back transformation
a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)
! add tau after the end of actuall v_row, to be broadcasted with it
...
...
@@ -452,7 +452,7 @@
endif !(my_pcol == pcol(istep, nblk, np_cols))
#ifdef WITH_MPI
! Broadcast the Householder
v
ector (and tau) along columns
! Broadcast the Householder
V
ector (and tau) along columns
call MPI_Bcast(v_row, l_rows+1, MPI_MATH_DATATYPE_PRECISION, &
pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#endif /* WITH_MPI */
...
...
@@ -460,7 +460,7 @@
!recover tau, which has been broadcasted together with v_row
tau(istep) = v_row(l_rows+1)
! Transpose Householder
v
ector v_row -> v_col
! Transpose Householder
V
ector v_row -> v_col
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
...
...
@@ -533,7 +533,7 @@
! multiplication by blocks is efficient only for CPU
! for GPU we introduced 2 other ways, either by stripes (more simmilar to the original
! CPU implementation) or by one large matrix
v
ector multiply
! CPU implementation) or by one large matrix
V
ector multiply
if (.not. useGPU) then
call timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
...
...
@@ -755,7 +755,7 @@
#endif
enddo
! We have calculated another Hauseholder
v
ector, number of implicitly stored increased
! We have calculated another Hauseholder
V
ector, number of implicitly stored increased
n_stored_vecs = n_stored_vecs+1
! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T
...
...
src/elpa2/elpa2_bandred_template.X90
View file @
6205cf76
...
...
@@ -646,7 +646,7 @@
#endif /* REALCASE == 1 */
do lc = n_cols, 1, -1
ncol = istep*nbw + lc ! absolute column number of householder
v
ector
ncol = istep*nbw + lc ! absolute column number of householder
V
ector
nrow = ncol - nbw ! Absolute number of pivot row
lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
...
...
@@ -660,10 +660,10 @@
if (my_pcol==cur_pcol) then
! Get
v
ector to be transformed; distribute last element and norm of
! Get
V
ector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column
vr(1:lr) = a(1:lr,lch) !
v
ector to be transformed
vr(1:lr) = a(1:lr,lch) !
V
ector to be transformed
if (my_prow==prow(nrow, nblk, np_rows)) then
aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
...
...
@@ -706,7 +706,7 @@
#endif
&PRECISION &
(vrl, vnorm2, xf, tau)
! Scale vr and store Householder
v
ector for back transformation
! Scale vr and store Householder
V
ector for back transformation
vr(1:lr) = vr(1:lr) * xf
if (my_prow==prow(nrow, nblk, np_rows)) then
...
...
@@ -724,7 +724,7 @@
endif
! Broadcast Householder
v
ector and tau along columns
! Broadcast Householder
V
ector and tau along columns
vr(lr+1) = tau
#ifdef WITH_MPI
...
...
@@ -759,7 +759,7 @@
#if COMPLEXCASE == 1
tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
! Transform remaining columns in current block with Householder
v
ector
! Transform remaining columns in current block with Householder
V
ector
! Local dot product
#if REALCASE == 1
...
...
src/elpa2/elpa2_compute_real_template.X90
View file @
6205cf76
...
...
@@ -343,7 +343,7 @@
ab(nb+2:,ns+i-1) = CONST_0_0
enddo
!send hh-
v
ector
!send hh-
V
ector
if (iblk==nblocks) then
#ifdef WITH_MPI
call timer%start("mpi_communication")
...
...
@@ -402,7 +402,7 @@
&(nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb)
endif
! Use new HH
v
ector for the next block
! Use new HH
V
ector for the next block
hv(:,:) = hv_new(:,:)
tau = tau_new
enddo
...
...
src/elpa2/elpa2_trans_ev_band_to_full_template.X90
View file @
6205cf76
...
...
@@ -344,7 +344,7 @@
ns = 0
do lc = 1, n_cols
ncol = istep*nbw + lc ! absolute column number of householder
v
ector
ncol = istep*nbw + lc ! absolute column number of householder
V
ector
nrow = ncol - nbw ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
...
...
@@ -704,9 +704,9 @@
do lc = 1, n_cols
#ifdef BAND_TO_FULL_BLOCKING
ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder
v
ector
ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder
V
ector
#else
ncol = istep*nbw + lc ! absolute column number of householder
v
ector
ncol = istep*nbw + lc ! absolute column number of householder
V
ector
#endif
nrow = ncol - nbw ! absolute number of pivot row
...
...
src/elpa2/elpa2_trans_ev_tridi_to_band_template.X90
View file @
6205cf76
...
...
@@ -1486,7 +1486,7 @@
else ! (current_local_n > 1) then
! for current_local_n == 1 the one and only HH
v
ector is 0 and not stored in hh_trans_real/complex
! for current_local_n == 1 the one and only HH
V
ector is 0 and not stored in hh_trans_real/complex
#if REALCASE == 1
bcast_buffer(:,1) = CONST_0_0
#endif
...
...
src/elpa2/elpa2_tridiag_band_template.X90
View file @
6205cf76
...
...
@@ -295,7 +295,7 @@
call determine_workload(nx, nb, np_rows, limits)
local_size = limits(my_prow+1) - limits(my_prow)
! add to number of householder vectors
! please note: for nx==1 the one and only HH
v
ector is 0 and is neither calculated nor send below!
! please note: for nx==1 the one and only HH
V
ector is 0 and is neither calculated nor send below!
if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
num_hh_vecs = num_hh_vecs + local_size
num_chunks = num_chunks+1
...
...
@@ -423,7 +423,7 @@
stop 1
endif
hh_cnt(:) = 1 ! The first transfomation
v
ector is always 0 and not calculated at all
hh_cnt(:) = 1 ! The first transfomation
V
ector is always 0 and not calculated at all
hh_dst(:) = 0 ! PE number for receive
#ifdef WITH_MPI
ireq_ab = MPI_REQUEST_NULL
...
...
@@ -589,7 +589,7 @@
endif
else
if (na>na_s) then
! Receive Householder
v
ector from previous task, from PE owning subdiagonal
! Receive Householder
V
ector from previous task, from PE owning subdiagonal
#ifdef WITH_OPENMP
...
...
@@ -701,7 +701,7 @@
hv = hv_t(:,my_thread)
tau = tau_t(my_thread)
! Store Householder
v
ector for back transformation
! Store Householder
V
ector for back transformation
hh_cnt(iblk) = hh_cnt(iblk) + 1
...
...
@@ -911,7 +911,7 @@
else
! We are at the end of all blocks
! Send last HH
v
ector and TAU to next PE if it has been calculated above
! Send last HH
V
ector and TAU to next PE if it has been calculated above
ne = na_s + nblocks*nb - (max_threads-1) - 1
if (istep>=max_threads .and. ne < na) then
#ifdef WITH_MPI
...
...
@@ -937,7 +937,7 @@
#endif /* WITH_MPI */
endif
! "Send" HH
v
ector and TAU to next OpenMP thread
! "Send" HH
V
ector and TAU to next OpenMP thread
do my_thread = max_threads, 2, -1
hv_t(:,my_thread) = hv_t(:,my_thread-1)
tau_t(my_thread) = tau_t(my_thread-1)
...
...
@@ -953,7 +953,7 @@
! The following code is structured in a way to keep waiting times for
! other PEs at a minimum, especially if there is only one block.
! For this reason, it requests the last column as late as possible
! and sends the Householder
v
ector and the first column as early
! and sends the Householder
V
ector and the first column as early
! as possible.
#endif /* WITH_OPENMP */
...
...
@@ -964,7 +964,7 @@
if (ns+n_off>na) exit
! Store Householder
v
ector for back transformation
! Store Householder
V
ector for back transformation
hh_cnt(iblk) = hh_cnt(iblk) + 1
...
...
@@ -1020,14 +1020,14 @@
! The following code is structured in a way to keep waiting times for
! other PEs at a minimum, especially if there is only one block.
! For this reason, it requests the last column as late as possible
! and sends the Householder
v
ector and the first column as early
! and sends the Householder
V
ector and the first column as early
! as possible.
#endif /* WITH_OPENMP */
nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
! Note that nr>=0 implies that diagonal block is full (nc==nb)!
! Multiply diagonal block and subdiagonal block with Householder
v
ector
! Multiply diagonal block and subdiagonal block with Householder
V
ector
if (iblk==nblocks .and. nc==nb) then
...
...
@@ -1312,7 +1312,7 @@
endif
endif
! Use new HH
v
ector for the next block
! Use new HH
V
ector for the next block
hv(:) = hv_new(:)
tau = tau_new
...
...
src/elpa2/kernels/elpa2_kernels_real_avx-avx2_2hv_double_precision.c
View file @
6205cf76
...
...
@@ -156,7 +156,7 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i
/**
* Unrolled kernel that computes
* 24 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline
void
hh_trafo_kernel_24_AVX_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
...
...
@@ -470,7 +470,7 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i
/**
* Unrolled kernel that computes
* 16 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline
void
hh_trafo_kernel_16_AVX_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
...
...
@@ -704,7 +704,7 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline
void
hh_trafo_kernel_8_AVX_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
...
...
@@ -858,7 +858,7 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline
void
hh_trafo_kernel_4_AVX_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
...
...
src/elpa2/kernels/elpa2_kernels_real_avx-avx2_2hv_single_precision.c
View file @
6205cf76
...
...
@@ -153,7 +153,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 24 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline
void
hh_trafo_kernel_24_AVX_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
...
...
@@ -470,7 +470,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 16 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline
void
hh_trafo_kernel_16_AVX_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
...
...
@@ -707,7 +707,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline
void
hh_trafo_kernel_8_AVX_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
...
...
@@ -865,7 +865,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_4_sse_instead_of_avx_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
...
...
@@ -965,7 +965,7 @@ __forceinline void hh_trafo_kernel_4_sse_instead_of_avx_2hv_single(float* q, flo
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline
void
hh_trafo_kernel_4_AVX_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
...
...
src/elpa2/kernels/elpa2_kernels_real_avx-avx2_4hv_double_precision.c
View file @
6205cf76
...
...
@@ -122,7 +122,7 @@ void quad_hh_trafo_real_avx_avx2_4hv_double(double* q, double* hh, int* pnb, int
double
s_2_4
=
hh
[(
ldh
*
3
)
+
2
];
double
s_3_4
=
hh
[(
ldh
*
3
)
+
1
];
// calculate scalar product of first and fourth householder
v
ector
// calculate scalar product of first and fourth householder
V
ector
// loop counter = 2
s_1_2
+=
hh
[
2
-
1
]
*
hh
[(
2
+
ldh
)];
s_2_3
+=
hh
[(
ldh
)
+
2
-
1
]
*
hh
[
2
+
(
ldh
*
2
)];
...
...
@@ -195,7 +195,7 @@ void quad_hh_trafo_real_avx_avx2_4hv_double(double* q, double* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 12 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline
void
hh_trafo_kernel_12_AVX_4hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s_1_2
,
double
s_1_3
,
double
s_2_3
,
double
s_1_4
,
double
s_2_4
,
double
s_3_4
)
...
...
@@ -723,7 +723,7 @@ __forceinline void hh_trafo_kernel_12_AVX_4hv_double(double* q, double* hh, int
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline
void
hh_trafo_kernel_8_AVX_4hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s_1_2
,
double
s_1_3
,
double
s_2_3
,
double
s_1_4
,
double
s_2_4
,
double
s_3_4
)
...
...
@@ -1078,7 +1078,7 @@ __forceinline void hh_trafo_kernel_8_AVX_4hv_double(double* q, double* hh, int n
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline
void
hh_trafo_kernel_4_AVX_4hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s_1_2
,
double
s_1_3
,
double
s_2_3
,
double
s_1_4
,
double
s_2_4
,
double
s_3_4
)
...
...
src/elpa2/kernels/elpa2_kernels_real_avx-avx2_4hv_single_precision.c
View file @
6205cf76
...
...
@@ -108,7 +108,7 @@ void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int*
float
s_2_4
=
hh
[(
ldh
*
3
)
+
2
];
float
s_3_4
=
hh
[(
ldh
*
3
)
+
1
];
// calculate scalar product of first and fourth householder
v
ector
// calculate scalar product of first and fourth householder
V
ector
// loop counter = 2
s_1_2
+=
hh
[
2
-
1
]
*
hh
[(
2
+
ldh
)];
s_2_3
+=
hh
[(
ldh
)
+
2
-
1
]
*
hh
[
2
+
(
ldh
*
2
)];
...
...
@@ -190,7 +190,7 @@ void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int*
/**
* Unrolled kernel that computes
* 24 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline
void
hh_trafo_kernel_24_AVX_4hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s_1_2
,
float
s_1_3
,
float
s_2_3
,
float
s_1_4
,
float
s_2_4
,
float
s_3_4
)
...
...
@@ -737,7 +737,7 @@ __forceinline void hh_trafo_kernel_24_AVX_4hv_single(float* q, float* hh, int nb
/**
* Unrolled kernel that computes
* 16 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline
void
hh_trafo_kernel_16_AVX_4hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s_1_2
,
float
s_1_3
,
float
s_2_3
,
float
s_1_4
,
float
s_2_4
,
float
s_3_4
)
...
...
@@ -1092,7 +1092,7 @@ __forceinline void hh_trafo_kernel_16_AVX_4hv_single(float* q, float* hh, int nb
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline
void
hh_trafo_kernel_8_AVX_4hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s_1_2
,
float
s_1_3
,
float
s_2_3
,
float
s_1_4
,
float
s_2_4
,
float
s_3_4
)
...
...
@@ -1447,7 +1447,7 @@ __forceinline void hh_trafo_kernel_8_AVX_4hv_single(float* q, float* hh, int nb,
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline
void
hh_trafo_kernel_4_AVX_4hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s_1_2
,
float
s_1_3
,
float
s_2_3
,
float
s_1_4
,
float
s_2_4
,
float
s_3_4
)
...
...
src/elpa2/kernels/elpa2_kernels_real_avx-avx2_6hv_double_precision.c
View file @
6205cf76
...
...
@@ -147,7 +147,7 @@ void hexa_hh_trafo_real_avx_avx2_6hv_double(double* q, double* hh, int* pnb, int
scalarprods
[
13
]
=
hh
[(
ldh
*
5
)
+
2
];
scalarprods
[
14
]
=
hh
[(
ldh
*
5
)
+
1
];
// calculate scalar product of first and fourth householder
v
ector
// calculate scalar product of first and fourth householder
V
ector
// loop counter = 2
scalarprods
[
0
]
+=
hh
[
1
]
*
hh
[(
2
+
ldh
)];
scalarprods
[
2
]
+=
hh
[(
ldh
)
+
1
]
*
hh
[
2
+
(
ldh
*
2
)];
...
...
@@ -317,7 +317,7 @@ void hexa_hh_trafo_real_avx_avx2_6hv_double(double* q, double* hh, int* pnb, int
// scalarprods[13] = hh[(ldh*5)+2];
// scalarprods[14] = hh[(ldh*5)+1];
//
// // calculate scalar product of first and fourth householder
v
ector
// // calculate scalar product of first and fourth householder
V
ector
// // loop counter = 2
// scalarprods[0] += hh[1] * hh[(2+ldh)];
// scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
...
...
@@ -415,7 +415,7 @@ void hexa_hh_trafo_real_avx_avx2_6hv_double(double* q, double* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline
void
hh_trafo_kernel_8_AVX_6hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
*
scalarprods
)
...
...
@@ -1188,7 +1188,7 @@ __forceinline void hh_trafo_kernel_8_AVX_6hv_double(double* q, double* hh, int n
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline
void
hh_trafo_kernel_4_AVX_6hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
*
scalarprods
)
...
...
src/elpa2/kernels/elpa2_kernels_real_avx-avx2_6hv_single_precision.c
View file @
6205cf76
...
...
@@ -117,7 +117,7 @@ void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int*
scalarprods
[
13
]
=
hh
[(
ldh
*
5
)
+
2
];
scalarprods
[
14
]
=
hh
[(
ldh
*
5
)
+
1
];
// calculate scalar product of first and fourth householder
v
ector
// calculate scalar product of first and fourth householder
V
ector
// loop counter = 2
scalarprods
[
0
]
+=
hh
[
1
]
*
hh
[(
2
+
ldh
)];
scalarprods
[
2
]
+=
hh
[(
ldh
)
+
1
]
*
hh
[
2
+
(
ldh
*
2
)];
...
...
@@ -215,7 +215,7 @@ void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int*
/**
* Unrolled kernel that computes
* 16 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline
void
hh_trafo_kernel_16_AVX_6hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
*
scalarprods
)
...
...
@@ -988,7 +988,7 @@ __forceinline void hh_trafo_kernel_16_AVX_6hv_single(float* q, float* hh, int nb
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 1 upsate is performed
*/
__forceinline
void
hh_trafo_kernel_4_AVX_6hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
*
scalarprods
)
...
...
@@ -1559,7 +1559,7 @@ __forceinline void hh_trafo_kernel_4_AVX_6hv_single(float* q, float* hh, int nb,
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline
void
hh_trafo_kernel_8_AVX_6hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
*
scalarprods
)
...
...
src/elpa2/kernels/elpa2_kernels_real_avx512_2hv_double_precision.c
View file @
6205cf76
...
...
@@ -123,7 +123,7 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 32 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline
void
hh_trafo_kernel_32_AVX512_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
...
...
@@ -282,7 +282,7 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 24 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline
void
hh_trafo_kernel_24_AVX512_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
...
...
@@ -414,7 +414,7 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 16 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline
void
hh_trafo_kernel_16_AVX512_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
...
...
@@ -521,7 +521,7 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline
void
hh_trafo_kernel_8_AVX512_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
...
...
@@ -606,7 +606,7 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_4_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
...
...
src/elpa2/kernels/elpa2_kernels_real_avx512_2hv_single_precision.c
View file @
6205cf76
...
...
@@ -126,7 +126,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
/**
* Unrolled kernel that computes
* 64 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline
void
hh_trafo_kernel_64_AVX512_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
...
...
@@ -283,7 +283,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
/**
* Unrolled kernel that computes
* 48 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline
void
hh_trafo_kernel_48_AVX512_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
...
...
@@ -441,7 +441,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
/**
* Unrolled kernel that computes
* 32 rows of Q simultaneously, a
* matrix
v
ector product with two householder
* matrix
V
ector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline
void
hh_trafo_kernel_32_AVX512_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
...
...
@@ -599,7 +599,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
/**