Commit 6205cf76 authored by Pavel Kus's avatar Pavel Kus
Browse files

changed vector to Vector in comments to fix problem in gcc preprocessor

on minsky
parent 2933c757
...@@ -358,7 +358,7 @@ __global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuF ...@@ -358,7 +358,7 @@ __global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuF
#endif #endif
int t_idx, v_idx; int t_idx, v_idx;
// The vector index (v_idx) identifies the pair of HH reflectors from which the dot product is computed // The Vector index (v_idx) identifies the pair of HH reflectors from which the dot product is computed
v_idx = blockIdx.x ; v_idx = blockIdx.x ;
// The thread index indicates the position within the two HH reflectors // The thread index indicates the position within the two HH reflectors
......
...@@ -137,7 +137,7 @@ ...@@ -137,7 +137,7 @@
! order, D(I) < D(J) for I < J. ! order, D(I) < D(J) for I < J.
! !
! Z (input) DOUBLE PRECISION array, dimension (N) ! Z (input) DOUBLE PRECISION array, dimension (N)
! The components of the updating vector. ! The components of the updating Vector.
! !
! DELTA (output) DOUBLE PRECISION array, dimension (N) ! DELTA (output) DOUBLE PRECISION array, dimension (N)
! DELTA contains (D(j) - lambda_I) in its j-th component. ! DELTA contains (D(j) - lambda_I) in its j-th component.
......
...@@ -275,8 +275,8 @@ ...@@ -275,8 +275,8 @@
nb = 0 nb = 0
do ic = ics, ice do ic = ics, ice
l_colh = local_index(ic , my_pcol, np_cols, nblk, -1) ! Column of Householder vector l_colh = local_index(ic , my_pcol, np_cols, nblk, -1) ! Column of Householder Vector
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder Vector
if (my_pcol == cur_pcol) then if (my_pcol == cur_pcol) then
...@@ -306,7 +306,7 @@ ...@@ -306,7 +306,7 @@
nb = 0 nb = 0
do ic = ics, ice do ic = ics, ice
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder Vector
hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows) hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows)
if (useGPU) then if (useGPU) then
hvm_ubnd = l_rows hvm_ubnd = l_rows
......
...@@ -174,8 +174,8 @@ ...@@ -174,8 +174,8 @@
#if REALCASE == 1 #if REALCASE == 1
real(kind=REAL_DATATYPE), allocatable :: tmp(:), & real(kind=REAL_DATATYPE), allocatable :: tmp(:), &
v_row(:), & ! used to store calculated Householder vector v_row(:), & ! used to store calculated Householder Vector
v_col(:), & ! the same vector, but transposed - differently distributed among MPI tasks v_col(:), & ! the same Vector, but transposed - differently distributed among MPI tasks
u_row(:), & u_row(:), &
u_col(:) u_col(:)
#endif #endif
...@@ -183,8 +183,8 @@ ...@@ -183,8 +183,8 @@
complex(kind=COMPLEX_DATATYPE), allocatable :: tmp(:), v_row(:), v_col(:), u_row(:), u_col(:) complex(kind=COMPLEX_DATATYPE), allocatable :: tmp(:), v_row(:), v_col(:), u_row(:), u_col(:)
#endif #endif
! the following two matrices store pairs of vectors v and u calculated in each step ! the following two matrices store pairs of vectors v and u calculated in each step
! at most max_stored_uv vector pairs are stored, then the matrix A_i is explicitly updated ! at most max_stored_uv Vector pairs are stored, then the matrix A_i is explicitly updated
! u and v are stored both in row and vector forms ! u and v are stored both in row and Vector forms
! pattern: v1,u1,v2,u2,v3,u3,.... ! pattern: v1,u1,v2,u2,v3,u3,....
! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa ! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa
#if REALCASE == 1 #if REALCASE == 1
...@@ -266,7 +266,7 @@ ...@@ -266,7 +266,7 @@
! allocate memory for vectors ! allocate memory for vectors
! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa ! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa
! todo: if something has length max_local_rows, it is actually a column, no? ! todo: if something has length max_local_rows, it is actually a column, no?
! todo: probably one should read it as v_row = vector v distributed among rows ! todo: probably one should read it as v_row = Vector v distributed among rows
! !
allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage) allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_& call check_alloc("tridiag_&
...@@ -359,7 +359,7 @@ ...@@ -359,7 +359,7 @@
endif endif
! main cycle of tridiagonalization ! main cycle of tridiagonalization
! in each step, 1 Householder vector is calculated ! in each step, 1 Householder Vector is calculated
do istep = na, 3 ,-1 do istep = na, 3 ,-1
! Calculate number of local rows and columns of the still remaining matrix ! Calculate number of local rows and columns of the still remaining matrix
...@@ -367,12 +367,12 @@ ...@@ -367,12 +367,12 @@
l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1) l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1)
l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1) l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1)
! Calculate vector for Householder transformation on all procs ! Calculate Vector for Householder transformation on all procs
! owning column istep ! owning column istep
if (my_pcol == pcol(istep, nblk, np_cols)) then if (my_pcol == pcol(istep, nblk, np_cols)) then
! Get vector to be transformed; distribute last element and norm of ! Get Vector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column ! remaining elements to all procs in current column
! copy l_cols + 1 column of A to v_row ! copy l_cols + 1 column of A to v_row
...@@ -434,7 +434,7 @@ ...@@ -434,7 +434,7 @@
#endif #endif
&PRECISION & &PRECISION &
(vrl, vnorm2, xf, tau(istep)) (vrl, vnorm2, xf, tau(istep))
! Scale v_row and store Householder vector for back transformation ! Scale v_row and store Householder Vector for back transformation
v_row(1:l_rows) = v_row(1:l_rows) * xf v_row(1:l_rows) = v_row(1:l_rows) * xf
if (my_prow == prow(istep-1, nblk, np_rows)) then if (my_prow == prow(istep-1, nblk, np_rows)) then
...@@ -444,7 +444,7 @@ ...@@ -444,7 +444,7 @@
e_vec(istep-1) = vrl e_vec(istep-1) = vrl
endif endif
! store Householder vector for back transformation ! store Householder Vector for back transformation
a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows) a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)
! add tau after the end of the actual v_row, to be broadcast with it ! add tau after the end of the actual v_row, to be broadcast with it
...@@ -452,7 +452,7 @@ ...@@ -452,7 +452,7 @@
endif !(my_pcol == pcol(istep, nblk, np_cols)) endif !(my_pcol == pcol(istep, nblk, np_cols))
#ifdef WITH_MPI #ifdef WITH_MPI
! Broadcast the Householder vector (and tau) along columns ! Broadcast the Householder Vector (and tau) along columns
call MPI_Bcast(v_row, l_rows+1, MPI_MATH_DATATYPE_PRECISION, & call MPI_Bcast(v_row, l_rows+1, MPI_MATH_DATATYPE_PRECISION, &
pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr) pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#endif /* WITH_MPI */ #endif /* WITH_MPI */
...@@ -460,7 +460,7 @@ ...@@ -460,7 +460,7 @@
!recover tau, which has been broadcasted together with v_row !recover tau, which has been broadcasted together with v_row
tau(istep) = v_row(l_rows+1) tau(istep) = v_row(l_rows+1)
! Transpose Householder vector v_row -> v_col ! Transpose Householder Vector v_row -> v_col
call elpa_transpose_vectors_& call elpa_transpose_vectors_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
...@@ -533,7 +533,7 @@ ...@@ -533,7 +533,7 @@
! multiplication by blocks is efficient only for CPU ! multiplication by blocks is efficient only for CPU
! for GPU we introduced 2 other ways, either by stripes (more similar to the original ! for GPU we introduced 2 other ways, either by stripes (more similar to the original
! CPU implementation) or by one large matrix vector multiply ! CPU implementation) or by one large matrix Vector multiply
if (.not. useGPU) then if (.not. useGPU) then
call timer%start("blas") call timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, & call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
...@@ -755,7 +755,7 @@ ...@@ -755,7 +755,7 @@
#endif #endif
enddo enddo
! We have calculated another Householder vector, number of implicitly stored vectors increased ! We have calculated another Householder Vector, number of implicitly stored vectors increased
n_stored_vecs = n_stored_vecs+1 n_stored_vecs = n_stored_vecs+1
! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T ! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T
......
...@@ -646,7 +646,7 @@ ...@@ -646,7 +646,7 @@
#endif /* REALCASE == 1 */ #endif /* REALCASE == 1 */
do lc = n_cols, 1, -1 do lc = n_cols, 1, -1
ncol = istep*nbw + lc ! absolute column number of householder vector ncol = istep*nbw + lc ! absolute column number of householder Vector
nrow = ncol - nbw ! Absolute number of pivot row nrow = ncol - nbw ! Absolute number of pivot row
lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
...@@ -660,10 +660,10 @@ ...@@ -660,10 +660,10 @@
if (my_pcol==cur_pcol) then if (my_pcol==cur_pcol) then
! Get vector to be transformed; distribute last element and norm of ! Get Vector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column ! remaining elements to all procs in current column
vr(1:lr) = a(1:lr,lch) ! vector to be transformed vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
if (my_prow==prow(nrow, nblk, np_rows)) then if (my_prow==prow(nrow, nblk, np_rows)) then
aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1)) aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
...@@ -706,7 +706,7 @@ ...@@ -706,7 +706,7 @@
#endif #endif
&PRECISION & &PRECISION &
(vrl, vnorm2, xf, tau) (vrl, vnorm2, xf, tau)
! Scale vr and store Householder vector for back transformation ! Scale vr and store Householder Vector for back transformation
vr(1:lr) = vr(1:lr) * xf vr(1:lr) = vr(1:lr) * xf
if (my_prow==prow(nrow, nblk, np_rows)) then if (my_prow==prow(nrow, nblk, np_rows)) then
...@@ -724,7 +724,7 @@ ...@@ -724,7 +724,7 @@
endif endif
! Broadcast Householder vector and tau along columns ! Broadcast Householder Vector and tau along columns
vr(lr+1) = tau vr(lr+1) = tau
#ifdef WITH_MPI #ifdef WITH_MPI
...@@ -759,7 +759,7 @@ ...@@ -759,7 +759,7 @@
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif #endif
! Transform remaining columns in current block with Householder vector ! Transform remaining columns in current block with Householder Vector
! Local dot product ! Local dot product
#if REALCASE == 1 #if REALCASE == 1
......
...@@ -343,7 +343,7 @@ ...@@ -343,7 +343,7 @@
ab(nb+2:,ns+i-1) = CONST_0_0 ab(nb+2:,ns+i-1) = CONST_0_0
enddo enddo
!send hh-vector !send hh-Vector
if (iblk==nblocks) then if (iblk==nblocks) then
#ifdef WITH_MPI #ifdef WITH_MPI
call timer%start("mpi_communication") call timer%start("mpi_communication")
...@@ -402,7 +402,7 @@ ...@@ -402,7 +402,7 @@
&(nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb) &(nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb)
endif endif
! Use new HH vector for the next block ! Use new HH Vector for the next block
hv(:,:) = hv_new(:,:) hv(:,:) = hv_new(:,:)
tau = tau_new tau = tau_new
enddo enddo
......
...@@ -344,7 +344,7 @@ ...@@ -344,7 +344,7 @@
ns = 0 ns = 0
do lc = 1, n_cols do lc = 1, n_cols
ncol = istep*nbw + lc ! absolute column number of householder vector ncol = istep*nbw + lc ! absolute column number of householder Vector
nrow = ncol - nbw ! absolute number of pivot row nrow = ncol - nbw ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
...@@ -704,9 +704,9 @@ ...@@ -704,9 +704,9 @@
do lc = 1, n_cols do lc = 1, n_cols
#ifdef BAND_TO_FULL_BLOCKING #ifdef BAND_TO_FULL_BLOCKING
ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder vector ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder Vector
#else #else
ncol = istep*nbw + lc ! absolute column number of householder vector ncol = istep*nbw + lc ! absolute column number of householder Vector
#endif #endif
nrow = ncol - nbw ! absolute number of pivot row nrow = ncol - nbw ! absolute number of pivot row
......
...@@ -1486,7 +1486,7 @@ ...@@ -1486,7 +1486,7 @@
else ! (current_local_n > 1) then else ! (current_local_n > 1) then
! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_real/complex ! for current_local_n == 1 the one and only HH Vector is 0 and not stored in hh_trans_real/complex
#if REALCASE == 1 #if REALCASE == 1
bcast_buffer(:,1) = CONST_0_0 bcast_buffer(:,1) = CONST_0_0
#endif #endif
......
...@@ -295,7 +295,7 @@ ...@@ -295,7 +295,7 @@
call determine_workload(nx, nb, np_rows, limits) call determine_workload(nx, nb, np_rows, limits)
local_size = limits(my_prow+1) - limits(my_prow) local_size = limits(my_prow+1) - limits(my_prow)
! add to number of householder vectors ! add to number of householder vectors
! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor sent below! ! please note: for nx==1 the one and only HH Vector is 0 and is neither calculated nor sent below!
if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
num_hh_vecs = num_hh_vecs + local_size num_hh_vecs = num_hh_vecs + local_size
num_chunks = num_chunks+1 num_chunks = num_chunks+1
...@@ -423,7 +423,7 @@ ...@@ -423,7 +423,7 @@
stop 1 stop 1
endif endif
hh_cnt(:) = 1 ! The first transformation vector is always 0 and not calculated at all hh_cnt(:) = 1 ! The first transformation Vector is always 0 and not calculated at all
hh_dst(:) = 0 ! PE number for receive hh_dst(:) = 0 ! PE number for receive
#ifdef WITH_MPI #ifdef WITH_MPI
ireq_ab = MPI_REQUEST_NULL ireq_ab = MPI_REQUEST_NULL
...@@ -589,7 +589,7 @@ ...@@ -589,7 +589,7 @@
endif endif
else else
if (na>na_s) then if (na>na_s) then
! Receive Householder vector from previous task, from PE owning subdiagonal ! Receive Householder Vector from previous task, from PE owning subdiagonal
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
...@@ -701,7 +701,7 @@ ...@@ -701,7 +701,7 @@
hv = hv_t(:,my_thread) hv = hv_t(:,my_thread)
tau = tau_t(my_thread) tau = tau_t(my_thread)
! Store Householder vector for back transformation ! Store Householder Vector for back transformation
hh_cnt(iblk) = hh_cnt(iblk) + 1 hh_cnt(iblk) = hh_cnt(iblk) + 1
...@@ -911,7 +911,7 @@ ...@@ -911,7 +911,7 @@
else else
! We are at the end of all blocks ! We are at the end of all blocks
! Send last HH vector and TAU to next PE if it has been calculated above ! Send last HH Vector and TAU to next PE if it has been calculated above
ne = na_s + nblocks*nb - (max_threads-1) - 1 ne = na_s + nblocks*nb - (max_threads-1) - 1
if (istep>=max_threads .and. ne < na) then if (istep>=max_threads .and. ne < na) then
#ifdef WITH_MPI #ifdef WITH_MPI
...@@ -937,7 +937,7 @@ ...@@ -937,7 +937,7 @@
#endif /* WITH_MPI */ #endif /* WITH_MPI */
endif endif
! "Send" HH vector and TAU to next OpenMP thread ! "Send" HH Vector and TAU to next OpenMP thread
do my_thread = max_threads, 2, -1 do my_thread = max_threads, 2, -1
hv_t(:,my_thread) = hv_t(:,my_thread-1) hv_t(:,my_thread) = hv_t(:,my_thread-1)
tau_t(my_thread) = tau_t(my_thread-1) tau_t(my_thread) = tau_t(my_thread-1)
...@@ -953,7 +953,7 @@ ...@@ -953,7 +953,7 @@
! The following code is structured in a way to keep waiting times for ! The following code is structured in a way to keep waiting times for
! other PEs at a minimum, especially if there is only one block. ! other PEs at a minimum, especially if there is only one block.
! For this reason, it requests the last column as late as possible ! For this reason, it requests the last column as late as possible
! and sends the Householder vector and the first column as early ! and sends the Householder Vector and the first column as early
! as possible. ! as possible.
#endif /* WITH_OPENMP */ #endif /* WITH_OPENMP */
...@@ -964,7 +964,7 @@ ...@@ -964,7 +964,7 @@
if (ns+n_off>na) exit if (ns+n_off>na) exit
! Store Householder vector for back transformation ! Store Householder Vector for back transformation
hh_cnt(iblk) = hh_cnt(iblk) + 1 hh_cnt(iblk) = hh_cnt(iblk) + 1
...@@ -1020,14 +1020,14 @@ ...@@ -1020,14 +1020,14 @@
! The following code is structured in a way to keep waiting times for ! The following code is structured in a way to keep waiting times for
! other PEs at a minimum, especially if there is only one block. ! other PEs at a minimum, especially if there is only one block.
! For this reason, it requests the last column as late as possible ! For this reason, it requests the last column as late as possible
! and sends the Householder vector and the first column as early ! and sends the Householder Vector and the first column as early
! as possible. ! as possible.
#endif /* WITH_OPENMP */ #endif /* WITH_OPENMP */
nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
! Note that nr>=0 implies that diagonal block is full (nc==nb)! ! Note that nr>=0 implies that diagonal block is full (nc==nb)!
! Multiply diagonal block and subdiagonal block with Householder vector ! Multiply diagonal block and subdiagonal block with Householder Vector
if (iblk==nblocks .and. nc==nb) then if (iblk==nblocks .and. nc==nb) then
...@@ -1312,7 +1312,7 @@ ...@@ -1312,7 +1312,7 @@
endif endif
endif endif
! Use new HH vector for the next block ! Use new HH Vector for the next block
hv(:) = hv_new(:) hv(:) = hv_new(:)
tau = tau_new tau = tau_new
......
...@@ -156,7 +156,7 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i ...@@ -156,7 +156,7 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i
/** /**
* Unrolled kernel that computes * Unrolled kernel that computes
* 24 rows of Q simultaneously, a * 24 rows of Q simultaneously, a
* matrix vector product with two householder * matrix Vector product with two householder
* vectors + a rank 2 update is performed * vectors + a rank 2 update is performed
*/ */
__forceinline void hh_trafo_kernel_24_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s) __forceinline void hh_trafo_kernel_24_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
...@@ -470,7 +470,7 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i ...@@ -470,7 +470,7 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i
/** /**
* Unrolled kernel that computes * Unrolled kernel that computes
* 16 rows of Q simultaneously, a * 16 rows of Q simultaneously, a
* matrix vector product with two householder * matrix Vector product with two householder
* vectors + a rank 2 update is performed * vectors + a rank 2 update is performed
*/ */
__forceinline void hh_trafo_kernel_16_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s) __forceinline void hh_trafo_kernel_16_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
...@@ -704,7 +704,7 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i ...@@ -704,7 +704,7 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i
/** /**
* Unrolled kernel that computes * Unrolled kernel that computes
* 8 rows of Q simultaneously, a * 8 rows of Q simultaneously, a
* matrix vector product with two householder * matrix Vector product with two householder
* vectors + a rank 2 update is performed * vectors + a rank 2 update is performed
*/ */
__forceinline void hh_trafo_kernel_8_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s) __forceinline void hh_trafo_kernel_8_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
...@@ -858,7 +858,7 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i ...@@ -858,7 +858,7 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i
/** /**
* Unrolled kernel that computes * Unrolled kernel that computes
* 4 rows of Q simultaneously, a * 4 rows of Q simultaneously, a
* matrix vector product with two householder * matrix Vector product with two householder
* vectors + a rank 2 update is performed * vectors + a rank 2 update is performed
*/ */
__forceinline void hh_trafo_kernel_4_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s) __forceinline void hh_trafo_kernel_4_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
......
...@@ -153,7 +153,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int ...@@ -153,7 +153,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
/** /**
* Unrolled kernel that computes * Unrolled kernel that computes
* 24 rows of Q simultaneously, a * 24 rows of Q simultaneously, a
* matrix vector product with two householder * matrix Vector product with two householder
* vectors + a rank 2 update is performed * vectors + a rank 2 update is performed
*/ */
__forceinline void hh_trafo_kernel_24_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s) __forceinline void hh_trafo_kernel_24_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
...@@ -470,7 +470,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int ...@@ -470,7 +470,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
/** /**
* Unrolled kernel that computes * Unrolled kernel that computes
* 16 rows of Q simultaneously, a * 16 rows of Q simultaneously, a
* matrix vector product with two householder * matrix Vector product with two householder
* vectors + a rank 2 update is performed * vectors + a rank 2 update is performed
*/ */
__forceinline void hh_trafo_kernel_16_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s) __forceinline void hh_trafo_kernel_16_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
...@@ -707,7 +707,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int ...@@ -707,7 +707,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
/** /**
* Unrolled kernel that computes * Unrolled kernel that computes
* 8 rows of Q simultaneously, a * 8 rows of Q simultaneously, a
* matrix vector product with two householder * matrix Vector product with two householder
* vectors + a rank 2 update is performed * vectors + a rank 2 update is performed
*/ */
__forceinline void hh_trafo_kernel_8_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s) __forceinline void hh_trafo_kernel_8_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
...@@ -865,7 +865,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int ...@@ -865,7 +865,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
/** /**
* Unrolled kernel that computes * Unrolled kernel that computes
* 4 rows of Q simultaneously, a * 4 rows of Q simultaneously, a
* matrix vector product with two householder * matrix Vector product with two householder
* vectors + a rank 2 update is performed * vectors + a rank 2 update is performed
*/ */
__forceinline void hh_trafo_kernel_4_sse_instead_of_avx_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s) __forceinline void hh_trafo_kernel_4_sse_instead_of_avx_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
...@@ -965,7 +965,7 @@ __forceinline void hh_trafo_kernel_4_sse_instead_of_avx_2hv_single(float* q, flo ...@@ -965,7 +965,7 @@ __forceinline void hh_trafo_kernel_4_sse_instead_of_avx_2hv_single(float* q, flo
/** /**
* Unrolled kernel that computes * Unrolled kernel that computes
* 4 rows of Q simultaneously, a * 4 rows of Q simultaneously, a
* matrix vector product with two householder * matrix Vector product with two householder
* vectors + a rank 2 update is performed * vectors + a rank 2 update is performed
*/ */
__forceinline void hh_trafo_kernel_4_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)