Commit 6205cf76 authored by Pavel Kus's avatar Pavel Kus

changed vector to Vector in comments to fix problem in gcc preprocessor

on minsky
parent 2933c757
......@@ -358,7 +358,7 @@ __global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuF
#endif
int t_idx, v_idx;
// The vector index (v_idx) identifies the pair of HH reflectors from which the dot product is computed
// The Vector index (v_idx) identifies the pair of HH reflectors from which the dot product is computed
v_idx = blockIdx.x ;
// The thread index indicates the position within the two HH reflectors
......
......@@ -137,7 +137,7 @@
! order, D(I) < D(J) for I < J.
!
! Z (input) DOUBLE PRECISION array, dimension (N)
! The components of the updating vector.
! The components of the updating Vector.
!
! DELTA (output) DOUBLE PRECISION array, dimension (N)
! DELTA contains (D(j) - lambda_I) in its j-th component.
......
......@@ -275,8 +275,8 @@
nb = 0
do ic = ics, ice
l_colh = local_index(ic , my_pcol, np_cols, nblk, -1) ! Column of Householder vector
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
l_colh = local_index(ic , my_pcol, np_cols, nblk, -1) ! Column of Householder Vector
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder Vector
if (my_pcol == cur_pcol) then
......@@ -306,7 +306,7 @@
nb = 0
do ic = ics, ice
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder Vector
hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows)
if (useGPU) then
hvm_ubnd = l_rows
......
......@@ -174,8 +174,8 @@
#if REALCASE == 1
real(kind=REAL_DATATYPE), allocatable :: tmp(:), &
v_row(:), & ! used to store calculated Householder vector
v_col(:), & ! the same vector, but transposed - differently distributed among MPI tasks
v_row(:), & ! used to store calculated Householder Vector
v_col(:), & ! the same Vector, but transposed - differently distributed among MPI tasks
u_row(:), &
u_col(:)
#endif
......@@ -183,8 +183,8 @@
complex(kind=COMPLEX_DATATYPE), allocatable :: tmp(:), v_row(:), v_col(:), u_row(:), u_col(:)
#endif
! the following two matrices store pairs of vectors v and u calculated in each step
! at most max_stored_uv vector pairs are stored, then the matrix A_i is explicitly updated
! u and v are stored both in row and vector forms
! at most max_stored_uv Vector pairs are stored, then the matrix A_i is explicitly updated
! u and v are stored both in row and Vector forms
! pattern: v1,u1,v2,u2,v3,u3,....
! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa
#if REALCASE == 1
......@@ -266,7 +266,7 @@
! allocate memory for vectors
! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa
! todo: if something has length max_local_rows, it is actually a column, no?
! todo: probably one should read it as v_row = vector v distributed among rows
! todo: probably one should read it as v_row = Vector v distributed among rows
!
allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
......@@ -359,7 +359,7 @@
endif
! main cycle of tridiagonalization
! in each step, 1 Householder vector is calculated
! in each step, 1 Householder Vector is calculated
do istep = na, 3 ,-1
! Calculate number of local rows and columns of the still remaining matrix
......@@ -367,12 +367,12 @@
l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1)
l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1)
! Calculate vector for Householder transformation on all procs
! Calculate Vector for Householder transformation on all procs
! owning column istep
if (my_pcol == pcol(istep, nblk, np_cols)) then
! Get vector to be transformed; distribute last element and norm of
! Get Vector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column
! copy l_cols + 1 column of A to v_row
......@@ -434,7 +434,7 @@
#endif
&PRECISION &
(vrl, vnorm2, xf, tau(istep))
! Scale v_row and store Householder vector for back transformation
! Scale v_row and store Householder Vector for back transformation
v_row(1:l_rows) = v_row(1:l_rows) * xf
if (my_prow == prow(istep-1, nblk, np_rows)) then
......@@ -444,7 +444,7 @@
e_vec(istep-1) = vrl
endif
! store Householder vector for back transformation
! store Householder Vector for back transformation
a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)
! add tau after the end of actual v_row, to be broadcast with it
......@@ -452,7 +452,7 @@
endif !(my_pcol == pcol(istep, nblk, np_cols))
#ifdef WITH_MPI
! Broadcast the Householder vector (and tau) along columns
! Broadcast the Householder Vector (and tau) along columns
call MPI_Bcast(v_row, l_rows+1, MPI_MATH_DATATYPE_PRECISION, &
pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#endif /* WITH_MPI */
......@@ -460,7 +460,7 @@
!recover tau, which has been broadcast together with v_row
tau(istep) = v_row(l_rows+1)
! Transpose Householder vector v_row -> v_col
! Transpose Householder Vector v_row -> v_col
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
......@@ -533,7 +533,7 @@
! multiplication by blocks is efficient only for CPU
! for GPU we introduced 2 other ways, either by stripes (more similar to the original
! CPU implementation) or by one large matrix vector multiply
! CPU implementation) or by one large matrix Vector multiply
if (.not. useGPU) then
call timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
......@@ -755,7 +755,7 @@
#endif
enddo
! We have calculated another Householder vector, number of implicitly stored increased
! We have calculated another Householder Vector, number of implicitly stored increased
n_stored_vecs = n_stored_vecs+1
! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T
......
......@@ -646,7 +646,7 @@
#endif /* REALCASE == 1 */
do lc = n_cols, 1, -1
ncol = istep*nbw + lc ! absolute column number of householder vector
ncol = istep*nbw + lc ! absolute column number of householder Vector
nrow = ncol - nbw ! Absolute number of pivot row
lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
......@@ -660,10 +660,10 @@
if (my_pcol==cur_pcol) then
! Get vector to be transformed; distribute last element and norm of
! Get Vector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column
vr(1:lr) = a(1:lr,lch) ! vector to be transformed
vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
if (my_prow==prow(nrow, nblk, np_rows)) then
aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
......@@ -706,7 +706,7 @@
#endif
&PRECISION &
(vrl, vnorm2, xf, tau)
! Scale vr and store Householder vector for back transformation
! Scale vr and store Householder Vector for back transformation
vr(1:lr) = vr(1:lr) * xf
if (my_prow==prow(nrow, nblk, np_rows)) then
......@@ -724,7 +724,7 @@
endif
! Broadcast Householder vector and tau along columns
! Broadcast Householder Vector and tau along columns
vr(lr+1) = tau
#ifdef WITH_MPI
......@@ -759,7 +759,7 @@
#if COMPLEXCASE == 1
tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
! Transform remaining columns in current block with Householder vector
! Transform remaining columns in current block with Householder Vector
! Local dot product
#if REALCASE == 1
......
......@@ -343,7 +343,7 @@
ab(nb+2:,ns+i-1) = CONST_0_0
enddo
!send hh-vector
!send hh-Vector
if (iblk==nblocks) then
#ifdef WITH_MPI
call timer%start("mpi_communication")
......@@ -402,7 +402,7 @@
&(nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb)
endif
! Use new HH vector for the next block
! Use new HH Vector for the next block
hv(:,:) = hv_new(:,:)
tau = tau_new
enddo
......
......@@ -344,7 +344,7 @@
ns = 0
do lc = 1, n_cols
ncol = istep*nbw + lc ! absolute column number of householder vector
ncol = istep*nbw + lc ! absolute column number of householder Vector
nrow = ncol - nbw ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
......@@ -704,9 +704,9 @@
do lc = 1, n_cols
#ifdef BAND_TO_FULL_BLOCKING
ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder vector
ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder Vector
#else
ncol = istep*nbw + lc ! absolute column number of householder vector
ncol = istep*nbw + lc ! absolute column number of householder Vector
#endif
nrow = ncol - nbw ! absolute number of pivot row
......
......@@ -1486,7 +1486,7 @@
else ! (current_local_n > 1) then
! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_real/complex
! for current_local_n == 1 the one and only HH Vector is 0 and not stored in hh_trans_real/complex
#if REALCASE == 1
bcast_buffer(:,1) = CONST_0_0
#endif
......
......@@ -295,7 +295,7 @@
call determine_workload(nx, nb, np_rows, limits)
local_size = limits(my_prow+1) - limits(my_prow)
! add to number of householder vectors
! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor sent below!
! please note: for nx==1 the one and only HH Vector is 0 and is neither calculated nor sent below!
if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
num_hh_vecs = num_hh_vecs + local_size
num_chunks = num_chunks+1
......@@ -423,7 +423,7 @@
stop 1
endif
hh_cnt(:) = 1 ! The first transformation vector is always 0 and not calculated at all
hh_cnt(:) = 1 ! The first transformation Vector is always 0 and not calculated at all
hh_dst(:) = 0 ! PE number for receive
#ifdef WITH_MPI
ireq_ab = MPI_REQUEST_NULL
......@@ -589,7 +589,7 @@
endif
else
if (na>na_s) then
! Receive Householder vector from previous task, from PE owning subdiagonal
! Receive Householder Vector from previous task, from PE owning subdiagonal
#ifdef WITH_OPENMP
......@@ -701,7 +701,7 @@
hv = hv_t(:,my_thread)
tau = tau_t(my_thread)
! Store Householder vector for back transformation
! Store Householder Vector for back transformation
hh_cnt(iblk) = hh_cnt(iblk) + 1
......@@ -911,7 +911,7 @@
else
! We are at the end of all blocks
! Send last HH vector and TAU to next PE if it has been calculated above
! Send last HH Vector and TAU to next PE if it has been calculated above
ne = na_s + nblocks*nb - (max_threads-1) - 1
if (istep>=max_threads .and. ne < na) then
#ifdef WITH_MPI
......@@ -937,7 +937,7 @@
#endif /* WITH_MPI */
endif
! "Send" HH vector and TAU to next OpenMP thread
! "Send" HH Vector and TAU to next OpenMP thread
do my_thread = max_threads, 2, -1
hv_t(:,my_thread) = hv_t(:,my_thread-1)
tau_t(my_thread) = tau_t(my_thread-1)
......@@ -953,7 +953,7 @@
! The following code is structured in a way to keep waiting times for
! other PEs at a minimum, especially if there is only one block.
! For this reason, it requests the last column as late as possible
! and sends the Householder vector and the first column as early
! and sends the Householder Vector and the first column as early
! as possible.
#endif /* WITH_OPENMP */
......@@ -964,7 +964,7 @@
if (ns+n_off>na) exit
! Store Householder vector for back transformation
! Store Householder Vector for back transformation
hh_cnt(iblk) = hh_cnt(iblk) + 1
......@@ -1020,14 +1020,14 @@
! The following code is structured in a way to keep waiting times for
! other PEs at a minimum, especially if there is only one block.
! For this reason, it requests the last column as late as possible
! and sends the Householder vector and the first column as early
! and sends the Householder Vector and the first column as early
! as possible.
#endif /* WITH_OPENMP */
nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
! Note that nr>=0 implies that diagonal block is full (nc==nb)!
! Multiply diagonal block and subdiagonal block with Householder vector
! Multiply diagonal block and subdiagonal block with Householder Vector
if (iblk==nblocks .and. nc==nb) then
......@@ -1312,7 +1312,7 @@
endif
endif
! Use new HH vector for the next block
! Use new HH Vector for the next block
hv(:) = hv_new(:)
tau = tau_new
......
......@@ -156,7 +156,7 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i
/**
* Unrolled kernel that computes
* 24 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_24_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
......@@ -470,7 +470,7 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i
/**
* Unrolled kernel that computes
* 16 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_16_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
......@@ -704,7 +704,7 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_8_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
......@@ -858,7 +858,7 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_4_AVX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
......
......@@ -153,7 +153,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 24 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_24_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
......@@ -470,7 +470,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 16 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_16_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
......@@ -707,7 +707,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_8_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
......@@ -865,7 +865,7 @@ void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_4_sse_instead_of_avx_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
......@@ -965,7 +965,7 @@ __forceinline void hh_trafo_kernel_4_sse_instead_of_avx_2hv_single(float* q, flo
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_4_AVX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
......
......@@ -122,7 +122,7 @@ void quad_hh_trafo_real_avx_avx2_4hv_double(double* q, double* hh, int* pnb, int
double s_2_4 = hh[(ldh*3)+2];
double s_3_4 = hh[(ldh*3)+1];
// calculate scalar product of first and fourth householder vector
// calculate scalar product of first and fourth householder Vector
// loop counter = 2
s_1_2 += hh[2-1] * hh[(2+ldh)];
s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)];
......@@ -195,7 +195,7 @@ void quad_hh_trafo_real_avx_avx2_4hv_double(double* q, double* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 12 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline void hh_trafo_kernel_12_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
......@@ -723,7 +723,7 @@ __forceinline void hh_trafo_kernel_12_AVX_4hv_double(double* q, double* hh, int
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline void hh_trafo_kernel_8_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
......@@ -1078,7 +1078,7 @@ __forceinline void hh_trafo_kernel_8_AVX_4hv_double(double* q, double* hh, int n
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline void hh_trafo_kernel_4_AVX_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
......
......@@ -108,7 +108,7 @@ void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int*
float s_2_4 = hh[(ldh*3)+2];
float s_3_4 = hh[(ldh*3)+1];
// calculate scalar product of first and fourth householder vector
// calculate scalar product of first and fourth householder Vector
// loop counter = 2
s_1_2 += hh[2-1] * hh[(2+ldh)];
s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)];
......@@ -190,7 +190,7 @@ void quad_hh_trafo_real_avx_avx2_4hv_single(float* q, float* hh, int* pnb, int*
/**
* Unrolled kernel that computes
* 24 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline void hh_trafo_kernel_24_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
......@@ -737,7 +737,7 @@ __forceinline void hh_trafo_kernel_24_AVX_4hv_single(float* q, float* hh, int nb
/**
* Unrolled kernel that computes
* 16 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline void hh_trafo_kernel_16_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
......@@ -1092,7 +1092,7 @@ __forceinline void hh_trafo_kernel_16_AVX_4hv_single(float* q, float* hh, int nb
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline void hh_trafo_kernel_8_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
......@@ -1447,7 +1447,7 @@ __forceinline void hh_trafo_kernel_8_AVX_4hv_single(float* q, float* hh, int nb,
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline void hh_trafo_kernel_4_AVX_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
......
......@@ -147,7 +147,7 @@ void hexa_hh_trafo_real_avx_avx2_6hv_double(double* q, double* hh, int* pnb, int
scalarprods[13] = hh[(ldh*5)+2];
scalarprods[14] = hh[(ldh*5)+1];
// calculate scalar product of first and fourth householder vector
// calculate scalar product of first and fourth householder Vector
// loop counter = 2
scalarprods[0] += hh[1] * hh[(2+ldh)];
scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
......@@ -317,7 +317,7 @@ void hexa_hh_trafo_real_avx_avx2_6hv_double(double* q, double* hh, int* pnb, int
// scalarprods[13] = hh[(ldh*5)+2];
// scalarprods[14] = hh[(ldh*5)+1];
//
// // calculate scalar product of first and fourth householder vector
// // calculate scalar product of first and fourth householder Vector
// // loop counter = 2
// scalarprods[0] += hh[1] * hh[(2+ldh)];
// scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
......@@ -415,7 +415,7 @@ void hexa_hh_trafo_real_avx_avx2_6hv_double(double* q, double* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline void hh_trafo_kernel_8_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
......@@ -1188,7 +1188,7 @@ __forceinline void hh_trafo_kernel_8_AVX_6hv_double(double* q, double* hh, int n
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline void hh_trafo_kernel_4_AVX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
......
......@@ -117,7 +117,7 @@ void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int*
scalarprods[13] = hh[(ldh*5)+2];
scalarprods[14] = hh[(ldh*5)+1];
// calculate scalar product of first and fourth householder vector
// calculate scalar product of first and fourth householder Vector
// loop counter = 2
scalarprods[0] += hh[1] * hh[(2+ldh)];
scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
......@@ -215,7 +215,7 @@ void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int*
/**
* Unrolled kernel that computes
* 16 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline void hh_trafo_kernel_16_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
......@@ -988,7 +988,7 @@ __forceinline void hh_trafo_kernel_16_AVX_6hv_single(float* q, float* hh, int nb
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline void hh_trafo_kernel_4_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
......@@ -1559,7 +1559,7 @@ __forceinline void hh_trafo_kernel_4_AVX_6hv_single(float* q, float* hh, int nb,
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 1 update is performed
*/
__forceinline void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
......
......@@ -123,7 +123,7 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 32 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_32_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
......@@ -282,7 +282,7 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 24 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_24_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
......@@ -414,7 +414,7 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 16 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_16_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
......@@ -521,7 +521,7 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_8_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
......@@ -606,7 +606,7 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_4_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
......
......@@ -126,7 +126,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
/**
* Unrolled kernel that computes
* 64 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_64_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
......@@ -283,7 +283,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
/**
* Unrolled kernel that computes
* 48 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_48_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
......@@ -441,7 +441,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
/**
* Unrolled kernel that computes
* 32 rows of Q simultaneously, a
* matrix vector product with two householder
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_32_AVX512_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
......@@ -599,7 +599,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
/**