elpa_pdgeqrf.F90 69.7 KB
Newer Older
1
2
3
4
5
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
6
7
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8
9
10
11
12
13
14
15
16
17
18
19
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
20
!    http://elpa.mpcdf.mpg.de/
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
module elpa_pdgeqrf

45
    use elpa1_compute
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
    use elpa_pdlarfb
    use qr_utils_mod

    implicit none

    PRIVATE

    public :: qr_pdgeqrf_2dcomm
    public :: qr_pqrparam_init
    public :: qr_pdlarfg2_1dcomm_check

    include 'mpif.h'

contains

subroutine qr_pdgeqrf_2dcomm(a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,nb,rowidx,colidx,rev,trans,PQRPARAM, &
                             mpicomm_rows,mpicomm_cols,blockheuristic)
Andreas Marek's avatar
Andreas Marek committed
63
    use precision
64
65
66
67
68
69
    use ELPA1
    use qr_utils_mod

    implicit none

    ! parameter setup
Andreas Marek's avatar
Andreas Marek committed
70
    INTEGER(kind=ik), parameter   :: gmode_ = 1, rank_ = 2, eps_ = 3
71
72

    ! input variables (local)
Andreas Marek's avatar
Andreas Marek committed
73
74
    integer(kind=ik)              :: lda,lwork,ldv,ldt
    real(kind=rk)                 :: a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*)
75
76

    ! input variables (global)
Andreas Marek's avatar
Andreas Marek committed
77
78
    integer(kind=ik)              :: m,n,mb,nb,rowidx,colidx,rev,trans,mpicomm_cols,mpicomm_rows
    integer(kind=ik)              :: PQRPARAM(*)
79
80

    ! output variables (global)
Andreas Marek's avatar
Andreas Marek committed
81
    real(kind=rk)                 :: blockheuristic(*)
82
83

    ! input variables derived from PQRPARAM
Andreas Marek's avatar
Andreas Marek committed
84
    integer(kind=ik)              :: updatemode,tmerge,size2d
85
86

    ! local scalars
Andreas Marek's avatar
Andreas Marek committed
87
88
89
90
91
92
93
94
95
96
97
98
99
100
    integer(kind=ik)              :: mpierr,mpirank_cols,broadcast_size,mpirank_rows
    integer(kind=ik)              :: mpirank_cols_qr,mpiprocs_cols
    integer(kind=ik)              :: lcols_temp,lcols,icol,lastcol
    integer(kind=ik)              :: baseoffset,offset,idx,voffset
    integer(kind=ik)              :: update_voffset,update_tauoffset
    integer(kind=ik)              :: update_lcols
    integer(kind=ik)              :: work_offset

    real(kind=rk)                 :: dbroadcast_size(1),dtmat_bcast_size(1)
    real(kind=rk)                 :: pdgeqrf_size(1),pdlarft_size(1),pdlarfb_size(1),tmerge_pdlarfb_size(1)
    integer(kind=ik)              :: temptau_offset,temptau_size,broadcast_offset,tmat_bcast_size
    integer(kind=ik)              :: remaining_cols
    integer(kind=ik)              :: total_cols
    integer(kind=ik)              :: incremental_update_size ! needed for incremental update mode
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317

    size2d = PQRPARAM(1)
    updatemode = PQRPARAM(2)
    tmerge = PQRPARAM(3)

    ! copy value before we are going to filter it
    total_cols = n

    call mpi_comm_rank(mpicomm_cols,mpirank_cols,mpierr)
    call mpi_comm_rank(mpicomm_rows,mpirank_rows,mpierr)
    call mpi_comm_size(mpicomm_cols,mpiprocs_cols,mpierr)


    call qr_pdgeqrf_1dcomm(a,lda,v,ldv,tau,t,ldt,pdgeqrf_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,trans, &
                           PQRPARAM(4),mpicomm_rows,blockheuristic)
    call qr_pdgeqrf_pack_unpack(v,ldv,dbroadcast_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,0,mpicomm_rows)
    call qr_pdgeqrf_pack_unpack_tmatrix(tau,t,ldt,dtmat_bcast_size(1),-1,total_cols,0)
    pdlarft_size(1) = 0.0d0
    call qr_pdlarfb_1dcomm(m,mb,total_cols,total_cols,a,lda,v,ldv,tau,t,ldt,rowidx,rowidx,rev,mpicomm_rows, &
                           pdlarfb_size(1),-1)
    call qr_tmerge_pdlarfb_1dcomm(m,mb,total_cols,total_cols,total_cols,v,ldv,t,ldt,a,lda,rowidx,rev,updatemode, &
                                  mpicomm_rows,tmerge_pdlarfb_size(1),-1)


    temptau_offset = 1
    temptau_size = total_cols
    broadcast_offset = temptau_offset + temptau_size
    broadcast_size = dbroadcast_size(1) + dtmat_bcast_size(1)
    work_offset = broadcast_offset + broadcast_size

    if (lwork .eq. -1) then
        work(1) = (DBLE(temptau_size) + DBLE(broadcast_size) + max(pdgeqrf_size(1),pdlarft_size(1),pdlarfb_size(1), &
                   tmerge_pdlarfb_size(1)))
        return
    end if

    lastcol = colidx-total_cols+1
    voffset = total_cols

    incremental_update_size = 0

    ! clear v buffer: just ensure that there is no junk in the upper triangle
    ! part, otherwise pdlarfb gets some problems
    ! pdlarfl(2) do not have these problems as they are working more on a vector
    ! basis
    v(1:ldv,1:total_cols) = 0.0d0

    icol = colidx

    remaining_cols = total_cols

    !print *,'start decomposition',m,rowidx,colidx

    do while (remaining_cols .gt. 0)

        ! determine rank of process column with next qr block
        mpirank_cols_qr = MOD((icol-1)/nb,mpiprocs_cols)

        ! lcols can't be larger than than nb
        ! exception: there is only one process column

        ! however, we might not start at the first local column.
        ! therefore assume a matrix of size (1xlcols) starting at (1,icol)
        ! determine the real amount of local columns
        lcols_temp = min(nb,(icol-lastcol+1))

        ! blocking parameter
        lcols_temp = max(min(lcols_temp,size2d),1)

        ! determine size from last decomposition column
        !  to first decomposition column
        call local_size_offset_1d(icol,nb,icol-lcols_temp+1,icol-lcols_temp+1,0, &
                                      mpirank_cols_qr,mpiprocs_cols, &
                                      lcols,baseoffset,offset)

        voffset = remaining_cols - lcols + 1

        idx = rowidx - colidx + icol

        if (mpirank_cols .eq. mpirank_cols_qr) then
            ! qr decomposition part

            tau(offset:offset+lcols-1) = 0.0d0

            call qr_pdgeqrf_1dcomm(a(1,offset),lda,v(1,voffset),ldv,tau(offset),t(voffset,voffset),ldt, &
                                   work(work_offset),lwork,m,lcols,mb,rowidx,idx,rev,trans,PQRPARAM(4), &
                                   mpicomm_rows,blockheuristic)

            ! pack broadcast buffer (v + tau)
            call qr_pdgeqrf_pack_unpack(v(1,voffset),ldv,work(broadcast_offset),lwork,m,lcols,mb,rowidx,&
                                        idx,rev,0,mpicomm_rows)

            ! determine broadcast size
            call qr_pdgeqrf_pack_unpack(v(1,voffset),ldv,dbroadcast_size(1),-1,m,lcols,mb,rowidx,idx,rev,&
                                        0,mpicomm_rows)
            broadcast_size = dbroadcast_size(1)

            !if (mpirank_rows .eq. 0) then
            ! pack tmatrix into broadcast buffer and calculate new size
            call qr_pdgeqrf_pack_unpack_tmatrix(tau(offset),t(voffset,voffset),ldt, &
                                                work(broadcast_offset+broadcast_size),lwork,lcols,0)
            call qr_pdgeqrf_pack_unpack_tmatrix(tau(offset),t(voffset,voffset),ldt,dtmat_bcast_size(1),-1,lcols,0)
            broadcast_size = broadcast_size + dtmat_bcast_size(1)
            !end if

            ! initiate broadcast (send part)
            call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, &
                           mpirank_cols_qr,mpicomm_cols,mpierr)

            ! copy tau parts into temporary tau buffer
            work(temptau_offset+voffset-1:temptau_offset+(voffset-1)+lcols-1) = tau(offset:offset+lcols-1)

            !print *,'generated tau:', tau(offset)
        else
            ! vector exchange part

            ! determine broadcast size
            call qr_pdgeqrf_pack_unpack(v(1,voffset),ldv,dbroadcast_size(1),-1,m,lcols,mb,rowidx,idx,rev,1,mpicomm_rows)
            broadcast_size = dbroadcast_size(1)

            call qr_pdgeqrf_pack_unpack_tmatrix(work(temptau_offset+voffset-1),t(voffset,voffset),ldt, &
                                                dtmat_bcast_size(1),-1,lcols,0)
            tmat_bcast_size = dtmat_bcast_size(1)

            !print *,'broadcast_size (nonqr)',broadcast_size
            broadcast_size = dbroadcast_size(1) + dtmat_bcast_size(1)

            ! initiate broadcast (recv part)
            call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, &
                           mpirank_cols_qr,mpicomm_cols,mpierr)

            ! last n*n elements in buffer are (still empty) T matrix elements
            ! fetch from first process in each column

            ! unpack broadcast buffer (v + tau)
            call qr_pdgeqrf_pack_unpack(v(1,voffset),ldv,work(broadcast_offset),lwork,m,lcols,mb,rowidx,idx,rev,1,mpicomm_rows)

            ! now send t matrix to other processes in our process column
            broadcast_size = dbroadcast_size(1)
            tmat_bcast_size = dtmat_bcast_size(1)

            ! t matrix should now be available on all processes => unpack
            call qr_pdgeqrf_pack_unpack_tmatrix(work(temptau_offset+voffset-1),t(voffset,voffset),ldt, &
                                                work(broadcast_offset+broadcast_size),lwork,lcols,1)
        end if

        remaining_cols = remaining_cols - lcols

        ! apply householder vectors to whole trailing matrix parts (if any)

        update_voffset = voffset
        update_tauoffset = icol
        update_lcols = lcols
        incremental_update_size = incremental_update_size + lcols

        icol = icol - lcols
        ! count colums from first column of global block to current index
        call local_size_offset_1d(icol,nb,colidx-n+1,colidx-n+1,0, &
                                      mpirank_cols,mpiprocs_cols, &
                                      lcols,baseoffset,offset)

        if (lcols .gt. 0) then

            !print *,'updating trailing matrix'

			if (updatemode .eq. ichar('I')) then
				print *,'pdgeqrf_2dcomm: incremental update not yet implemented! rev=1'
			else if (updatemode .eq. ichar('F')) then
				! full update no merging
				call qr_pdlarfb_1dcomm(m,mb,lcols,update_lcols,a(1,offset),lda,v(1,update_voffset),ldv, &
							work(temptau_offset+update_voffset-1),                          &
                                                        t(update_voffset,update_voffset),ldt, &
							rowidx,idx,1,mpicomm_rows,work(work_offset),lwork)
			else
				! full update + merging default
				call qr_tmerge_pdlarfb_1dcomm(m,mb,lcols,n-(update_voffset+update_lcols-1),update_lcols, &
                                                              v(1,update_voffset),ldv, &
							      t(update_voffset,update_voffset),ldt, &
							      a(1,offset),lda,rowidx,1,updatemode,mpicomm_rows, &
                                                              work(work_offset),lwork)
			end if
        else
			if (updatemode .eq. ichar('I')) then
				print *,'sole merging of (incremental) T matrix', mpirank_cols,  &
                                        n-(update_voffset+incremental_update_size-1)
				call qr_tmerge_pdlarfb_1dcomm(m,mb,0,n-(update_voffset+incremental_update_size-1),   &
                                                              incremental_update_size,v(1,update_voffset),ldv, &
							      t(update_voffset,update_voffset),ldt, &
							      a,lda,rowidx,1,updatemode,mpicomm_rows,work(work_offset),lwork)

				! reset for upcoming incremental updates
				incremental_update_size = 0
			else if (updatemode .eq. ichar('M')) then
				! final merge
				call qr_tmerge_pdlarfb_1dcomm(m,mb,0,n-(update_voffset+update_lcols-1),update_lcols, &
                                                              v(1,update_voffset),ldv, &
							      t(update_voffset,update_voffset),ldt, &
							      a,lda,rowidx,1,updatemode,mpicomm_rows,work(work_offset),lwork)
			else
				! full updatemode - nothing to update
			end if

			! reset for upcoming incremental updates
			incremental_update_size = 0
        end if
    end do

    if ((tmerge .gt. 0) .and. (updatemode .eq. ichar('F'))) then
        ! finally merge all small T parts
        call qr_pdlarft_tree_merge_1dcomm(m,mb,n,size2d,tmerge,v,ldv,t,ldt,rowidx,rev,mpicomm_rows,work,lwork)
    end if

    !print *,'stop decomposition',rowidx,colidx

end subroutine qr_pdgeqrf_2dcomm

subroutine qr_pdgeqrf_1dcomm(a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,baseidx,rowidx,rev,trans,PQRPARAM,mpicomm,blockheuristic)
Andreas Marek's avatar
Andreas Marek committed
318
    use precision
319
320
321
322
323
    use ELPA1

    implicit none

    ! parameter setup
Andreas Marek's avatar
Andreas Marek committed
324
    INTEGER(kind=ik), parameter  :: gmode_ = 1,rank_ = 2,eps_ = 3
325
326

    ! input variables (local)
Andreas Marek's avatar
Andreas Marek committed
327
328
    integer(kind=ik)             :: lda,lwork,ldv,ldt
    real(kind=rk)                :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(*)
329
330

    ! input variables (global)
Andreas Marek's avatar
Andreas Marek committed
331
332
    integer(kind=ik)             :: m,n,mb,baseidx,rowidx,rev,trans,mpicomm
    integer(kind=ik)             :: PQRPARAM(*)
333
334
335
336

    ! derived input variables

    ! derived further input variables from QR_PQRPARAM
Andreas Marek's avatar
Andreas Marek committed
337
    integer(kind=ik)             :: size1d,updatemode,tmerge
338
339

    ! output variables (global)
Andreas Marek's avatar
Andreas Marek committed
340
    real(kind=rk)                :: blockheuristic(*)
341
342

    ! local scalars
Andreas Marek's avatar
Andreas Marek committed
343
344
    integer(kind=ik)             :: nr_blocks,remainder,current_block,aoffset,idx,updatesize
    real(kind=rk)                :: pdgeqr2_size(1),pdlarfb_size(1),tmerge_tree_size(1)
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429

    size1d = max(min(PQRPARAM(1),n),1)
    updatemode = PQRPARAM(2)
    tmerge = PQRPARAM(3)

    if (lwork .eq. -1) then
        call qr_pdgeqr2_1dcomm(a,lda,v,ldv,tau,t,ldt,pdgeqr2_size,-1, &
                                m,size1d,mb,baseidx,baseidx,rev,trans,PQRPARAM(4),mpicomm,blockheuristic)

        ! reserve more space for incremental mode
        call qr_tmerge_pdlarfb_1dcomm(m,mb,n,n,n,v,ldv,t,ldt, &
                                       a,lda,baseidx,rev,updatemode,mpicomm,pdlarfb_size,-1)

        call qr_pdlarft_tree_merge_1dcomm(m,mb,n,size1d,tmerge,v,ldv,t,ldt,baseidx,rev,mpicomm,tmerge_tree_size,-1)

        work(1) = max(pdlarfb_size(1),pdgeqr2_size(1),tmerge_tree_size(1))
        return
    end if

        nr_blocks = n / size1d
        remainder = n - nr_blocks*size1d

        current_block = 0
        do while (current_block .lt. nr_blocks)
            idx = rowidx-current_block*size1d
            updatesize = n-(current_block+1)*size1d
            aoffset = 1+updatesize

            call qr_pdgeqr2_1dcomm(a(1,aoffset),lda,v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt,work,lwork, &
                                    m,size1d,mb,baseidx,idx,1,trans,PQRPARAM(4),mpicomm,blockheuristic)

            if (updatemode .eq. ichar('M')) then
                ! full update + merging
                call qr_tmerge_pdlarfb_1dcomm(m,mb,updatesize,current_block*size1d,size1d, &
                                               v(1,aoffset),ldv,t(aoffset,aoffset),ldt, &
                                               a,lda,baseidx,1,ichar('F'),mpicomm,work,lwork)
            else if (updatemode .eq. ichar('I')) then
                if (updatesize .ge. size1d) then
                    ! incremental update + merging
                    call qr_tmerge_pdlarfb_1dcomm(m,mb,size1d,current_block*size1d,size1d, &
                                                   v(1,aoffset),ldv,t(aoffset,aoffset),ldt, &
                                                   a(1,aoffset-size1d),lda,baseidx,1,updatemode,mpicomm,work,lwork)

                else ! only remainder left
                    ! incremental update + merging
                    call qr_tmerge_pdlarfb_1dcomm(m,mb,remainder,current_block*size1d,size1d, &
                                                   v(1,aoffset),ldv,t(aoffset,aoffset),ldt, &
                                                   a(1,1),lda,baseidx,1,updatemode,mpicomm,work,lwork)
                end if
            else ! full update no merging is default
                ! full update no merging
                call qr_pdlarfb_1dcomm(m,mb,updatesize,size1d,a,lda,v(1,aoffset),ldv, &
                                        tau(aoffset),t(aoffset,aoffset),ldt,baseidx,idx,1,mpicomm,work,lwork)
            end if

            ! move on to next block
            current_block = current_block+1
        end do

        if (remainder .gt. 0) then
            aoffset = 1
            idx = rowidx-size1d*nr_blocks
            call qr_pdgeqr2_1dcomm(a(1,aoffset),lda,v,ldv,tau,t,ldt,work,lwork, &
                                    m,remainder,mb,baseidx,idx,1,trans,PQRPARAM(4),mpicomm,blockheuristic)

            if ((updatemode .eq. ichar('I')) .or. (updatemode .eq. ichar('M'))) then
                ! final merging
                call qr_tmerge_pdlarfb_1dcomm(m,mb,0,size1d*nr_blocks,remainder, &
                                               v,ldv,t,ldt, &
                                               a,lda,baseidx,1,updatemode,mpicomm,work,lwork) ! updatemode argument does not matter
            end if
        end if

    if ((tmerge .gt. 0) .and. (updatemode .eq. ichar('F'))) then
        ! finally merge all small T parts
        call qr_pdlarft_tree_merge_1dcomm(m,mb,n,size1d,tmerge,v,ldv,t,ldt,baseidx,rev,mpicomm,work,lwork)
    end if

end subroutine qr_pdgeqrf_1dcomm

! local a and tau are assumed to be positioned at the right column from a local
! perspective
! TODO: if local amount of data turns to zero the algorithm might produce wrong
! results (probably due to old buffer contents)
subroutine qr_pdgeqr2_1dcomm(a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,baseidx,rowidx,rev,trans,PQRPARAM,mpicomm,blockheuristic)
Andreas Marek's avatar
Andreas Marek committed
430
    use precision
431
432
433
434
435
    use ELPA1

    implicit none

    ! parameter setup
Andreas Marek's avatar
Andreas Marek committed
436
    INTEGER(kind=ik), parameter   :: gmode_ = 1,rank_ = 2 ,eps_ = 3, upmode1_ = 4
437
438

    ! input variables (local)
Andreas Marek's avatar
Andreas Marek committed
439
440
    integer(kind=ik)              :: lda,lwork,ldv,ldt
    real(kind=rk)                 :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(*)
441
442

    ! input variables (global)
Andreas Marek's avatar
Andreas Marek committed
443
444
    integer(kind=ik)              :: m,n,mb,baseidx,rowidx,rev,trans,mpicomm
    integer(kind=ik)              :: PQRPARAM(*)
445
446

    ! output variables (global)
Andreas Marek's avatar
Andreas Marek committed
447
    real(kind=rk)                 :: blockheuristic(*)
448
449

    ! derived further input variables from QR_PQRPARAM
Andreas Marek's avatar
Andreas Marek committed
450
    integer(kind=ik)              ::  maxrank,hgmode,updatemode
451
452

    ! local scalars
Andreas Marek's avatar
Andreas Marek committed
453
454
455
456
457
458
459
460
    integer(kind=ik)              :: icol,incx,idx
    real(kind=rk)                 :: pdlarfg_size(1),pdlarf_size(1),total_size
    real(kind=rk)                 :: pdlarfg2_size(1),pdlarfgk_size(1),pdlarfl2_size(1)
    real(kind=rk)                 :: pdlarft_size(1),pdlarfb_size(1),pdlarft_pdlarfb_size(1),tmerge_pdlarfb_size(1)
    integer(kind=ik)              :: mpirank,mpiprocs,mpierr
    integer(kind=ik)              :: rank,lastcol,actualrank,nextrank
    integer(kind=ik)              :: update_cols,decomposition_cols
    integer(kind=ik)              :: current_column
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574

    maxrank = min(PQRPARAM(1),n)
    updatemode = PQRPARAM(2)
    hgmode = PQRPARAM(4)

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

    if (trans .eq. 1) then
        incx = lda
    else
        incx = 1
    end if

    if (lwork .eq. -1) then
        call qr_pdlarfg_1dcomm(a,incx,tau(1),pdlarfg_size(1),-1,n,rowidx,mb,hgmode,rev,mpicomm)
        call qr_pdlarfl_1dcomm(v,1,baseidx,a,lda,tau(1),pdlarf_size(1),-1,m,n,rowidx,mb,rev,mpicomm)
        call qr_pdlarfg2_1dcomm_ref(a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfg2_size(1),-1,m,rowidx,mb,PQRPARAM,rev,mpicomm,actualrank)
        call qr_pdlarfgk_1dcomm(a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfgk_size(1),-1,m,n,rowidx,mb,PQRPARAM,rev,mpicomm,actualrank)
        call qr_pdlarfl2_tmatrix_1dcomm(v,ldv,baseidx,a,lda,t,ldt,pdlarfl2_size(1),-1,m,n,rowidx,mb,rev,mpicomm)
        pdlarft_size(1) = 0.0d0
        call qr_pdlarfb_1dcomm(m,mb,n,n,a,lda,v,ldv,tau,t,ldt,baseidx,rowidx,1,mpicomm,pdlarfb_size(1),-1)
        pdlarft_pdlarfb_size(1) = 0.0d0
        call qr_tmerge_pdlarfb_1dcomm(m,mb,n,n,n,v,ldv,t,ldt,a,lda,rowidx,rev,updatemode,mpicomm,tmerge_pdlarfb_size(1),-1)

        total_size = max(pdlarfg_size(1),pdlarf_size(1),pdlarfg2_size(1),pdlarfgk_size(1),pdlarfl2_size(1),pdlarft_size(1), &
                         pdlarfb_size(1),pdlarft_pdlarfb_size(1),tmerge_pdlarfb_size(1))

        work(1) = total_size
        return
    end if

        icol = 1
        lastcol = min(rowidx,n)
        decomposition_cols = lastcol
        update_cols = n
        do while (decomposition_cols .gt. 0) ! local qr block
            icol = lastcol-decomposition_cols+1
            idx = rowidx-icol+1

            ! get possible rank size
            ! limited by number of columns and remaining rows
            rank = min(n-icol+1,maxrank,idx)

            current_column = n-icol+1-rank+1

            if (rank .eq. 1) then

                call qr_pdlarfg_1dcomm(a(1,current_column),incx, &
                                        tau(current_column),work,lwork, &
                                        m,idx,mb,hgmode,1,mpicomm)

                v(1:ldv,current_column) = 0.0d0
                call qr_pdlarfg_copy_1dcomm(a(1,current_column),incx, &
                                             v(1,current_column),1, &
                                             m,baseidx,idx,mb,1,mpicomm)

                ! initialize t matrix part
                t(current_column,current_column) = tau(current_column)

                actualrank = 1

            else if (rank .eq. 2) then
                call qr_pdlarfg2_1dcomm_ref(a(1,current_column),lda,tau(current_column), &
                                             t(current_column,current_column),ldt,v(1,current_column),ldv, &
                                            baseidx,work,lwork,m,idx,mb,PQRPARAM,1,mpicomm,actualrank)

            else
                call qr_pdlarfgk_1dcomm(a(1,current_column),lda,tau(current_column), &
                                         t(current_column,current_column),ldt,v(1,current_column),ldv, &
                                         baseidx,work,lwork,m,rank,idx,mb,PQRPARAM,1,mpicomm,actualrank)

            end if

            blockheuristic(actualrank) = blockheuristic(actualrank) + 1

            ! the blocked decomposition versions already updated their non
            ! decomposed parts using their information after communication
            update_cols = decomposition_cols - rank
            decomposition_cols = decomposition_cols - actualrank

            ! needed for incremental update
            nextrank = min(n-(lastcol-decomposition_cols+1)+1,maxrank,rowidx-(lastcol-decomposition_cols+1)+1)

            if (current_column .gt. 1) then
                idx = rowidx-icol+1

                if (updatemode .eq. ichar('I')) then
                    ! incremental update + merging
                    call qr_tmerge_pdlarfb_1dcomm(m,mb,nextrank-(rank-actualrank),n-(current_column+rank-1),actualrank, &
                                                  v(1,current_column+(rank-actualrank)),ldv, &
                                                  t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, &
                                                  a(1,current_column-nextrank+(rank-actualrank)),lda,baseidx,rev,updatemode,&
                                                  mpicomm,work,lwork)
                else
                    ! full update + merging
                    call qr_tmerge_pdlarfb_1dcomm(m,mb,update_cols,n-(current_column+rank-1),actualrank, &
                                                  v(1,current_column+(rank-actualrank)),ldv, &
                                                  t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, &
                                                  a(1,1),lda,baseidx,rev,updatemode,mpicomm,work,lwork)
                end if
            else
                call qr_tmerge_pdlarfb_1dcomm(m,mb,0,n-(current_column+rank-1),actualrank,v(1,current_column+(rank-actualrank)), &
                                              ldv, &
                                              t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, &
                                              a,lda,baseidx,rev,updatemode,mpicomm,work,lwork)
            end if

        end do
end subroutine qr_pdgeqr2_1dcomm

! incx == 1: column major
! incx != 1: row major
subroutine qr_pdlarfg_1dcomm(x,incx,tau,work,lwork,n,idx,nb,hgmode,rev,mpi_comm)
Andreas Marek's avatar
Andreas Marek committed
575
576

    use precision
577
578
579
580
    use ELPA1
    use qr_utils_mod

    implicit none
Andreas Marek's avatar
Andreas Marek committed
581

582
    ! parameter setup
Andreas Marek's avatar
Andreas Marek committed
583
    INTEGER(kind=ik), parameter    :: gmode_ = 1,rank_ = 2, eps_ = 3
584
585

    ! input variables (local)
Andreas Marek's avatar
Andreas Marek committed
586
587
    integer(kind=ik)               :: incx,lwork,hgmode
    real(kind=rk)                  :: x(*),work(*)
588
589

    ! input variables (global)
Andreas Marek's avatar
Andreas Marek committed
590
    integer(kind=ik)               :: mpi_comm,nb,idx,n,rev
591
592

    ! output variables (global)
Andreas Marek's avatar
Andreas Marek committed
593
    real(kind=rk)                  :: tau
594
595

    ! local scalars
Andreas Marek's avatar
Andreas Marek committed
596
597
598
599
600
    integer(kind=ik)               :: mpierr,mpirank,mpiprocs,mpirank_top
    integer(kind=ik)               :: sendsize,recvsize
    integer(kind=ik)               :: local_size,local_offset,baseoffset
    integer(kind=ik)               :: topidx,top,iproc
    real(kind=rk)                  :: alpha,xnorm,dot,xf
601
602

    ! external functions
Andreas Marek's avatar
Andreas Marek committed
603
604
    real(kind=rk), external        :: ddot,dlapy2,dnrm2
    external                       :: dscal
605
606

    ! intrinsic
Andreas Marek's avatar
Andreas Marek committed
607
!    intrinsic sign
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718

	if (idx .le. 1) then
		tau = 0.0d0
		return
	end if

    call MPI_Comm_rank(mpi_comm, mpirank, mpierr)
    call MPI_Comm_size(mpi_comm, mpiprocs, mpierr)

    ! calculate expected work size and store in work(1)
    if (hgmode .eq. ichar('s')) then
        ! allreduce (MPI_SUM)
        sendsize = 2
        recvsize = sendsize
    else if (hgmode .eq. ichar('x')) then
        ! alltoall
        sendsize = mpiprocs*2
        recvsize = sendsize
    else if (hgmode .eq. ichar('g')) then
        ! allgather
        sendsize = 2
        recvsize = mpiprocs*sendsize
    else
        ! no exchange at all (benchmarking)
        sendsize = 2
        recvsize = sendsize
    end if

    if (lwork .eq. -1) then
        work(1) = DBLE(sendsize + recvsize)
        return
    end if

    ! Processor id for global index of top element
    mpirank_top = MOD((idx-1)/nb,mpiprocs)
    if (mpirank .eq. mpirank_top) then
        topidx = local_index(idx,mpirank_top,mpiprocs,nb,0)
        top = 1+(topidx-1)*incx
    end if

	call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
							  local_size,baseoffset,local_offset)

    local_offset = local_offset * incx

    ! calculate and exchange information
    if (hgmode .eq. ichar('s')) then
        if (mpirank .eq. mpirank_top) then
            alpha = x(top)
        else
            alpha = 0.0d0
        end if

        dot = ddot(local_size, &
                   x(local_offset), incx, &
                   x(local_offset), incx)

        work(1) = alpha
        work(2) = dot

        call mpi_allreduce(work(1),work(sendsize+1), &
                           sendsize,mpi_real8,mpi_sum, &
                           mpi_comm,mpierr)

        alpha = work(sendsize+1)
        xnorm = sqrt(work(sendsize+2))
    else if (hgmode .eq. ichar('x')) then
        if (mpirank .eq. mpirank_top) then
            alpha = x(top)
        else
            alpha = 0.0d0
        end if

        xnorm = dnrm2(local_size, x(local_offset), incx)

        do iproc=0,mpiprocs-1
            work(2*iproc+1) = alpha
            work(2*iproc+2) = xnorm
        end do

        call mpi_alltoall(work(1),2,mpi_real8, &
                          work(sendsize+1),2,mpi_real8, &
                          mpi_comm,mpierr)

        ! extract alpha value
        alpha = work(sendsize+1+mpirank_top*2)

        ! copy norm parts of buffer to beginning
        do iproc=0,mpiprocs-1
            work(iproc+1) = work(sendsize+1+2*iproc+1)
        end do

        xnorm = dnrm2(mpiprocs, work(1), 1)
    else if (hgmode .eq. ichar('g')) then
        if (mpirank .eq. mpirank_top) then
            alpha = x(top)
        else
            alpha = 0.0d0
        end if

        xnorm = dnrm2(local_size, x(local_offset), incx)
        work(1) = alpha
        work(2) = xnorm

        ! allgather
        call mpi_allgather(work(1),sendsize,mpi_real8, &
                          work(sendsize+1),sendsize,mpi_real8, &
                          mpi_comm,mpierr)

        ! extract alpha value
        alpha = work(sendsize+1+mpirank_top*2)
Andreas Marek's avatar
Andreas Marek committed
719

720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
        ! copy norm parts of buffer to beginning
        do iproc=0,mpiprocs-1
            work(iproc+1) = work(sendsize+1+2*iproc+1)
        end do

        xnorm = dnrm2(mpiprocs, work(1), 1)
    else
        ! dnrm2
        xnorm = dnrm2(local_size, x(local_offset), incx)

        if (mpirank .eq. mpirank_top) then
            alpha = x(top)
        else
            alpha = 0.0d0
        end if

        ! no exchange at all (benchmarking)

        xnorm = 0.0d0
    end if

    !print *,'ref hg:', idx,xnorm,alpha
    !print *,x(1:n)

    ! calculate householder information
    if (xnorm .eq. 0.0d0) then
        ! H = I

        tau = 0.0d0
    else
        ! General case

        call hh_transform_real(alpha,xnorm**2,xf,tau)
        if (mpirank .eq. mpirank_top) then
            x(top) = alpha
        end if

        call dscal(local_size, xf, &
                   x(local_offset), incx)

        ! TODO: reimplement norm rescale method of
        ! original PDLARFG using mpi?

    end if

    ! useful for debugging
    !print *,'hg:mpirank,idx,beta,alpha:',mpirank,idx,beta,alpha,1.0d0/(beta+alpha),tau
    !print *,x(1:n)

end subroutine qr_pdlarfg_1dcomm

subroutine qr_pdlarfg2_1dcomm_ref(a,lda,tau,t,ldt,v,ldv,baseidx,work,lwork,m,idx,mb,PQRPARAM,rev,mpicomm,actualk)
Andreas Marek's avatar
Andreas Marek committed
772
    use precision
773
774
775
    implicit none

    ! parameter setup
Andreas Marek's avatar
Andreas Marek committed
776
    INTEGER(kind=ik), parameter    :: gmode_ = 1,rank_ = 2,eps_ = 3, upmode1_ = 4
777
    ! input variables (local)
Andreas Marek's avatar
Andreas Marek committed
778
779
    integer(kind=ik)               :: lda,lwork,ldv,ldt
    real(kind=rk)                  :: a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*)
780
781

    ! input variables (global)
Andreas Marek's avatar
Andreas Marek committed
782
783
    integer(kind=ik)               :: m,idx,baseidx,mb,rev,mpicomm
    integer(kind=ik)               :: PQRPARAM(*)
784
785

    ! output variables (global)
Andreas Marek's avatar
Andreas Marek committed
786
    integer(kind=ik)               :: actualk
787
788

    ! derived input variables from QR_PQRPARAM
Andreas Marek's avatar
Andreas Marek committed
789
    integer(kind=ik)               :: eps
790
791

    ! local scalars
Andreas Marek's avatar
Andreas Marek committed
792
793
794
795
    real(kind=rk)                  :: dseedwork_size(1)
    integer(kind=ik)               :: seedwork_size,seed_size
    integer(kind=ik)               :: seedwork_offset,seed_offset
    logical                        :: accurate
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873

    call qr_pdlarfg2_1dcomm_seed(a,lda,dseedwork_size(1),-1,work,m,mb,idx,rev,mpicomm)
    seedwork_size = dseedwork_size(1)
    seed_size = seedwork_size

    if (lwork .eq. -1) then
        work(1) = seedwork_size + seed_size
        return
    end if

    seedwork_offset = 1
    seed_offset = seedwork_offset + seedwork_size

    eps = PQRPARAM(3)

    ! check for border cases (only a 2x2 matrix left)
	if (idx .le. 1) then
		tau(1:2) = 0.0d0
		t(1:2,1:2) = 0.0d0
		return
	end if

    call qr_pdlarfg2_1dcomm_seed(a,lda,work(seedwork_offset),lwork,work(seed_offset),m,mb,idx,rev,mpicomm)

        if (eps .gt. 0) then
            accurate = qr_pdlarfg2_1dcomm_check(work(seed_offset),eps)
        else
            accurate = .true.
        end if

        call qr_pdlarfg2_1dcomm_vector(a(1,2),1,tau(2),work(seed_offset), &
                                        m,mb,idx,0,1,mpicomm)

        call qr_pdlarfg_copy_1dcomm(a(1,2),1, &
                                     v(1,2),1, &
                                     m,baseidx,idx,mb,1,mpicomm)

        call qr_pdlarfg2_1dcomm_update(v(1,2),1,baseidx,a(1,1),lda,work(seed_offset),m,idx,mb,rev,mpicomm)

        ! check for 2x2 matrix case => only one householder vector will be
        ! generated
        if (idx .gt. 2) then
            if (accurate .eqv. .true.) then
                call qr_pdlarfg2_1dcomm_vector(a(1,1),1,tau(1),work(seed_offset), &
                                                m,mb,idx-1,1,1,mpicomm)

                call qr_pdlarfg_copy_1dcomm(a(1,1),1, &
                                             v(1,1),1, &
                                             m,baseidx,idx-1,mb,1,mpicomm)

                ! generate fuse element
                call qr_pdlarfg2_1dcomm_finalize_tmatrix(work(seed_offset),tau,t,ldt)

                actualk = 2
            else
                t(1,1) = 0.0d0
                t(1,2) = 0.0d0
                t(2,2) = tau(2)

                actualk = 1
            end if
        else
            t(1,1) = 0.0d0
            t(1,2) = 0.0d0
            t(2,2) = tau(2)

            ! no more vectors to create

            tau(1) = 0.0d0

            actualk = 2

            !print *,'rank2: no more data'
        end if

end subroutine qr_pdlarfg2_1dcomm_ref

subroutine qr_pdlarfg2_1dcomm_seed(a,lda,work,lwork,seed,n,nb,idx,rev,mpicomm)
Andreas Marek's avatar
Andreas Marek committed
874
    use precision
875
876
877
878
879
880
    use ELPA1
    use qr_utils_mod

    implicit none

    ! input variables (local)
Andreas Marek's avatar
Andreas Marek committed
881
882
    integer(kind=ik)        :: lda,lwork
    real(kind=rk)           :: a(lda,*),work(*),seed(*)
883
884

    ! input variables (global)
Andreas Marek's avatar
Andreas Marek committed
885
    integer(kind=ik)        :: n,nb,idx,rev,mpicomm
886
887
888
889

    ! output variables (global)

    ! external functions
Andreas Marek's avatar
Andreas Marek committed
890
    real(kind=rk), external :: ddot
891
892

    ! local scalars
Andreas Marek's avatar
Andreas Marek committed
893
894
895
896
897
898
899
900
    real(kind=rk)           :: top11,top21,top12,top22
    real(kind=rk)           :: dot11,dot12,dot22
    integer(kind=ik)        :: mpirank,mpiprocs,mpierr
    integer(kind=ik)        :: mpirank_top11,mpirank_top21
    integer(kind=ik)        :: top11_offset,top21_offset
    integer(kind=ik)        :: baseoffset
    integer(kind=ik)        :: local_offset1,local_size1
    integer(kind=ik)        :: local_offset2,local_size2
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958

    if (lwork .eq. -1) then
        work(1) = DBLE(8)
        return
    end if

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

        call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
                              local_size1,baseoffset,local_offset1)

        call local_size_offset_1d(n,nb,idx,idx-2,rev,mpirank,mpiprocs, &
                              local_size2,baseoffset,local_offset2)

        mpirank_top11 = MOD((idx-1)/nb,mpiprocs)
        mpirank_top21 = MOD((idx-2)/nb,mpiprocs)

        top11_offset = local_index(idx,mpirank_top11,mpiprocs,nb,0)
        top21_offset = local_index(idx-1,mpirank_top21,mpiprocs,nb,0)

        if (mpirank_top11 .eq. mpirank) then
            top11 = a(top11_offset,2)
            top12 = a(top11_offset,1)
        else
            top11 = 0.0d0
            top12 = 0.0d0
        end if

        if (mpirank_top21 .eq. mpirank) then
            top21 = a(top21_offset,2)
            top22 = a(top21_offset,1)
        else
            top21 = 0.0d0
            top22 = 0.0d0
        end if

        ! calculate 3 dot products
        dot11 = ddot(local_size1,a(local_offset1,2),1,a(local_offset1,2),1)
        dot12 = ddot(local_size1,a(local_offset1,2),1,a(local_offset1,1),1)
        dot22 = ddot(local_size2,a(local_offset2,1),1,a(local_offset2,1),1)

    ! store results in work buffer
    work(1) = top11
    work(2) = dot11
    work(3) = top12
    work(4) = dot12
    work(5) = top21
    work(6) = top22
    work(7) = dot22
    work(8) = 0.0d0 ! fill up buffer

    ! exchange partial results
    call mpi_allreduce(work, seed, 8, mpi_real8, mpi_sum, &
                       mpicomm, mpierr)
end subroutine qr_pdlarfg2_1dcomm_seed

logical function qr_pdlarfg2_1dcomm_check(seed,eps)
Andreas Marek's avatar
Andreas Marek committed
959
    use precision
960
961
962
    implicit none

    ! input variables
Andreas Marek's avatar
Andreas Marek committed
963
964
    real(kind=rk)    ::  seed(*)
    integer(kind=ik) :: eps
965
966

    ! local scalars
Andreas Marek's avatar
Andreas Marek committed
967
968
969
970
    real(kind=rk)    :: epsd,first,second,first_second,estimate
    logical          :: accurate
    real(kind=rk)    :: dot11,dot12,dot22
    real(kind=rk)    :: top11,top12,top21,top22
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007

    EPSD = EPS

    top11 = seed(1)
    dot11 = seed(2)
    top12 = seed(3)
    dot12 = seed(4)

    top21 = seed(5)
    top22 = seed(6)
    dot22 = seed(7)

    ! reconstruct the whole inner products
    ! (including squares of the top elements)
    first = dot11 + top11*top11
    second = dot22 + top22*top22 + top12*top12
    first_second = dot12 + top11*top12

    ! zero Householder vector (zero norm) case
    if (first*second .eq. 0.0d0) then
       qr_pdlarfg2_1dcomm_check = .false.
       return
    end if

    estimate = abs((first_second*first_second)/(first*second))

    !print *,'estimate:',estimate

    ! if accurate the following check holds
    accurate = (estimate .LE. (epsd/(1.0d0+epsd)))

    qr_pdlarfg2_1dcomm_check = accurate
end function qr_pdlarfg2_1dcomm_check

! id=0: first vector
! id=1: second vector
subroutine qr_pdlarfg2_1dcomm_vector(x,incx,tau,seed,n,nb,idx,id,rev,mpicomm)
Andreas Marek's avatar
Andreas Marek committed
1008
    use precision
1009
1010
1011
1012
1013
1014
    use ELPA1
    use qr_utils_mod

    implicit none

    ! input variables (local)
Andreas Marek's avatar
Andreas Marek committed
1015
1016
    integer(kind=ik)        :: incx
    real(kind=rk)           :: x(*),seed(*),tau
1017
1018

    ! input variables (global)
Andreas Marek's avatar
Andreas Marek committed
1019
    integer(kind=ik)        :: n,nb,idx,id,rev,mpicomm
1020
1021
1022
1023

    ! output variables (global)

    ! external functions
Andreas Marek's avatar
Andreas Marek committed
1024
1025
    real(kind=rk), external :: dlapy2
    external                :: dscal
1026
1027

    ! local scalars
Andreas Marek's avatar
Andreas Marek committed
1028
1029
1030
    integer(kind=ik)        :: mpirank,mpirank_top,mpiprocs,mpierr
    real(kind=rk)           :: alpha,dot,beta,xnorm
    integer(kind=ik)        :: local_size,baseoffset,local_offset,top,topidx
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

    call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
                                  local_size,baseoffset,local_offset)

    local_offset = local_offset * incx

    ! Processor id for global index of top element
    mpirank_top = MOD((idx-1)/nb,mpiprocs)
    if (mpirank .eq. mpirank_top) then
        topidx = local_index(idx,mpirank_top,mpiprocs,nb,0)
        top = 1+(topidx-1)*incx
    end if

    alpha = seed(id*5+1)
    dot = seed(id*5+2)

    xnorm = sqrt(dot)

    if (xnorm .eq. 0.0d0) then
        ! H = I

        tau = 0.0d0
    else
        ! General case

        beta = sign(dlapy2(alpha, xnorm), alpha)
        tau = (beta+alpha) / beta

        !print *,'hg2',tau,xnorm,alpha

        call dscal(local_size, 1.0d0/(beta+alpha), &
                   x(local_offset), incx)

        ! TODO: reimplement norm rescale method of
        ! original PDLARFG using mpi?

        if (mpirank .eq. mpirank_top) then
            x(top) = -beta
        end if

        seed(8) = beta
    end if
end subroutine qr_pdlarfg2_1dcomm_vector

subroutine qr_pdlarfg2_1dcomm_update(v,incv,baseidx,a,lda,seed,n,idx,nb,rev,mpicomm)
Andreas Marek's avatar
Andreas Marek committed
1079
    use precision
1080
1081
1082
1083
1084
1085
    use ELPA1
    use qr_utils_mod

    implicit none

    ! input variables (local)
Andreas Marek's avatar
Andreas Marek committed
1086
1087
    integer(kind=ik)   :: incv,lda
    real(kind=rk)      :: v(*),a(lda,*),seed(*)
1088
1089

    ! input variables (global)
Andreas Marek's avatar
Andreas Marek committed
1090
    integer(kind=ik)   :: n,baseidx,idx,nb,rev,mpicomm
1091
1092
1093
1094
1095
1096
1097

    ! output variables (global)

    ! external functions
    external daxpy

    ! local scalars
Andreas Marek's avatar
Andreas Marek committed
1098
1099
1100
1101
1102
    integer(kind=ik)   :: mpirank,mpiprocs,mpierr
    integer(kind=ik)   :: local_size,local_offset,baseoffset
    real(kind=rk)      :: z,coeff,beta
    real(kind=rk)      :: dot11,dot12,dot22
    real(kind=rk)      :: top11,top12,top21,top22
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)


    ! seed should be updated by previous householder generation
    ! Update inner product of this column and next column vector
    top11 = seed(1)
    dot11 = seed(2)
    top12 = seed(3)
    dot12 = seed(4)

    top21 = seed(5)
    top22 = seed(6)
    dot22 = seed(7)
    beta = seed(8)

    call local_size_offset_1d(n,nb,baseidx,idx,rev,mpirank,mpiprocs, &
                              local_size,baseoffset,local_offset)
    baseoffset = baseoffset * incv

    ! zero Householder vector (zero norm) case
    if (beta .eq. 0.0d0) then
        return
    end if
    z = (dot12 + top11 * top12) / beta + top12

    !print *,'hg2 update:',baseidx,idx,mpirank,local_size

    call daxpy(local_size, -z, v(baseoffset),1, a(local_offset,1),1)

    ! prepare a full dot22 for update
    dot22 = dot22 + top22*top22

    ! calculate coefficient
    COEFF = z / (top11 + beta)

    ! update inner product of next vector
    dot22 = dot22 - coeff * (2*dot12 - coeff*dot11)

    ! update dot12 value to represent update with first vector
    ! (needed for T matrix)
    dot12 = dot12 - COEFF * dot11

    ! update top element of next vector
    top22 = top22 - coeff * top21
    seed(6) = top22

    ! restore separated dot22 for vector generation
    seed(7) = dot22  - top22*top22

    !------------------------------------------------------
    ! prepare elements for T matrix
    seed(4) = dot12

    ! prepare dot matrix for fuse element of T matrix
    ! replace top11 value with -beta1
    seed(1) = beta
end subroutine qr_pdlarfg2_1dcomm_update

! run this function after second vector
subroutine qr_pdlarfg2_1dcomm_finalize_tmatrix(seed,tau,t,ldt)
Andreas Marek's avatar
Andreas Marek committed
1165
    use precision
1166
1167
    implicit none

Andreas Marek's avatar
Andreas Marek committed
1168
1169
1170
    integer(kind=ik)  :: ldt
    real(kind=rk)     :: seed(*),t(ldt,*),tau(*)
    real(kind=rk)     :: dot12,beta1,top21,beta2
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187

    beta1 = seed(1)
    dot12 = seed(4)
    top21 = seed(5)
    beta2 = seed(8)

    !print *,'beta1 beta2',beta1,beta2

    dot12 = dot12 / beta2 + top21
    dot12 = -(dot12 / beta1)

    t(1,1) = tau(1)
    t(1,2) = dot12
    t(2,2) = tau(2)
end subroutine qr_pdlarfg2_1dcomm_finalize_tmatrix

subroutine qr_pdlarfgk_1dcomm(a,lda,tau,t,ldt,v,ldv,baseidx,work,lwork,m,k,idx,mb,PQRPARAM,rev,mpicomm,actualk)
Andreas Marek's avatar
Andreas Marek committed
1188
    use precision
1189
1190
1191
1192
1193
    implicit none

    ! parameter setup

    ! input variables (local)
Andreas Marek's avatar
Andreas Marek committed
1194
1195
    integer(kind=ik)    :: lda,lwork,ldv,ldt
    real(kind=rk)       :: a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*)
1196
1197

    ! input variables (global)
Andreas Marek's avatar
Andreas Marek committed
1198
1199
    integer(kind=ik)    :: m,k,idx,baseidx,mb,rev,mpicomm
    integer(kind=ik)    :: PQRPARAM(*)
1200
1201

    ! output variables (global)
Andreas Marek's avatar
Andreas Marek committed
1202
    integer(kind=ik)    :: actualk
1203
1204

    ! local scalars
Andreas Marek's avatar
Andreas Marek committed
1205
1206
1207
1208
1209
1210
1211
    integer(kind=ik)    :: ivector
    real(kind=rk)       :: pdlarfg_size(1),pdlarf_size(1)
    real(kind=rk)       :: pdlarfgk_1dcomm_seed_size(1),pdlarfgk_1dcomm_check_size(1)
    real(kind=rk)       :: pdlarfgk_1dcomm_update_size(1)
    integer(kind=ik)    :: seedC_size,seedC_offset
    integer(kind=ik)    :: seedD_size,seedD_offset
    integer(kind=ik)    :: work_offset
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260

    seedC_size = k*k
    seedC_offset = 1
    seedD_size = k*k
    seedD_offset = seedC_offset + seedC_size
    work_offset = seedD_offset + seedD_size

    if (lwork .eq. -1) then
        call qr_pdlarfg_1dcomm(a,1,tau(1),pdlarfg_size(1),-1,m,baseidx,mb,PQRPARAM(4),rev,mpicomm)
        call qr_pdlarfl_1dcomm(v,1,baseidx,a,lda,tau(1),pdlarf_size(1),-1,m,k,baseidx,mb,rev,mpicomm)
        call qr_pdlarfgk_1dcomm_seed(a,lda,baseidx,pdlarfgk_1dcomm_seed_size(1),-1,work,work,m,k,mb,mpicomm)
        !call qr_pdlarfgk_1dcomm_check(work,work,k,PQRPARAM,pdlarfgk_1dcomm_check_size(1),-1,actualk)
        call qr_pdlarfgk_1dcomm_check_improved(work,work,k,PQRPARAM,pdlarfgk_1dcomm_check_size(1),-1,actualk)
        call qr_pdlarfgk_1dcomm_update(a,lda,baseidx,pdlarfgk_1dcomm_update_size(1),-1,work,work,k,k,1,work,m,mb,rev,mpicomm)
        work(1) = max(pdlarfg_size(1),pdlarf_size(1),pdlarfgk_1dcomm_seed_size(1),pdlarfgk_1dcomm_check_size(1), &
                      pdlarfgk_1dcomm_update_size(1)) + DBLE(seedC_size + seedD_size);
        return
    end if

        call qr_pdlarfgk_1dcomm_seed(a(1,1),lda,idx,work(work_offset),lwork,work(seedC_offset),work(seedD_offset),m,k,mb,mpicomm)
        !call qr_pdlarfgk_1dcomm_check(work(seedC_offset),work(seedD_offset),k,PQRPARAM,work(work_offset),lwork,actualk)
        call qr_pdlarfgk_1dcomm_check_improved(work(seedC_offset),work(seedD_offset),k,PQRPARAM,work(work_offset),lwork,actualk)

        !print *,'possible rank:', actualk

        ! override useful for debugging
        !actualk = 1
        !actualk = k
        !actualk= min(actualk,2)
        do ivector=1,actualk
            call qr_pdlarfgk_1dcomm_vector(a(1,k-ivector+1),1,idx,tau(k-ivector+1), &
                                            work(seedC_offset),work(seedD_offset),k, &
                                            ivector,m,mb,rev,mpicomm)

            call qr_pdlarfgk_1dcomm_update(a(1,1),lda,idx,work(work_offset),lwork,work(seedC_offset), &
                                            work(seedD_offset),k,actualk,ivector,tau, &
                                            m,mb,rev,mpicomm)

            call qr_pdlarfg_copy_1dcomm(a(1,k-ivector+1),1, &
                                         v(1,k-ivector+1),1, &
                                         m,baseidx,idx-ivector+1,mb,1,mpicomm)
        end do

        ! generate final T matrix and convert preliminary tau values into real ones
        call qr_pdlarfgk_1dcomm_generateT(work(seedC_offset),work(seedD_offset),k,actualk,tau,t,ldt)

end subroutine qr_pdlarfgk_1dcomm

subroutine qr_pdlarfgk_1dcomm_seed(a,lda,baseidx,work,lwork,seedC,seedD,m,k,mb,mpicomm)
Andreas Marek's avatar
Andreas Marek committed
1261
    use precision
1262
1263
1264
1265
1266
1267
1268
1269
    use ELPA1
    use qr_utils_mod

    implicit none

    ! parameter setup

    ! input variables (local)
Andreas Marek's avatar
Andreas Marek committed
1270
1271
    integer(kind=ik)   :: lda,lwork
    real(kind=rk)      :: a(lda,*), work(*)
1272
1273

    ! input variables (global)
Andreas Marek's avatar
Andreas Marek committed
1274
1275
    integer(kind=ik)   :: m,k,baseidx,mb,mpicomm
    real(kind=rk)      :: seedC(k,*),seedD(k,*)
1276
1277
1278
1279
1280
1281

    ! output variables (global)

    ! derived input variables from QR_PQRPARAM

    ! local scalars
Andreas Marek's avatar
Andreas Marek committed
1282
1283
1284
    integer(kind=ik)   :: mpierr,mpirank,mpiprocs,mpirank_top
    integer(kind=ik)   :: icol,irow,lidx,remsize
    integer(kind=ik)   :: remaining_rank
1285

Andreas Marek's avatar
Andreas Marek committed
1286
1287
    integer(kind=ik)   :: C_size,D_size,sendoffset,recvoffset,sendrecv_size
    integer(kind=ik)   :: localoffset,localsize,baseoffset
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

    C_size = k*k
    D_size = k*k
    sendoffset = 1
    sendrecv_size = C_size+D_size
    recvoffset = sendoffset + sendrecv_size

    if (lwork .eq. -1) then
        work(1) = DBLE(2*sendrecv_size)
        return
    end if

    ! clear buffer
    work(sendoffset:sendoffset+sendrecv_size-1)=0.0d0

    ! collect C part
    do icol=1,k

        remaining_rank = k
        do while (remaining_rank .gt. 0)
            irow = k - remaining_rank + 1
            lidx = baseidx - remaining_rank + 1

            ! determine chunk where the current top element is located
            mpirank_top = MOD((lidx-1)/mb,mpiprocs)

            ! limit max number of remaining elements of this chunk to the block
            ! distribution parameter
            remsize = min(remaining_rank,mb)

            ! determine the number of needed elements in this chunk
            call local_size_offset_1d(lidx+remsize-1,mb, &
                                      lidx,lidx,0, &
                                      mpirank_top,mpiprocs, &
                                      localsize,baseoffset,localoffset)

            !print *,'local rank',localsize,localoffset

            if (mpirank .eq. mpirank_top) then
                ! copy elements to buffer
                work(sendoffset+(icol-1)*k+irow-1:sendoffset+(icol-1)*k+irow-1+localsize-1) &
                            = a(localoffset:localoffset+remsize-1,icol)
            end if

            ! jump to next chunk
            remaining_rank = remaining_rank - localsize
        end do
    end do

    ! collect D part
	call local_size_offset_1d(m,mb,baseidx-k,baseidx-k,1, &
							  mpirank,mpiprocs, &
							  localsize,baseoffset,localoffset)

    !print *,'localsize',localsize,localoffset
    if (localsize > 0) then
        call dsyrk("Upper", "Trans", k, localsize, &
                   1.0d0, a(localoffset,1), lda, &
                   0.0d0, work(sendoffset+C_size), k)
    else
        work(sendoffset+C_size:sendoffset+C_size+k*k-1) = 0.0d0
    end if

    ! TODO: store symmetric part more efficiently

    ! allreduce operation on results
    call mpi_allreduce(work(sendoffset),work(recvoffset),sendrecv_size, &
                       mpi_real8,mpi_sum,mpicomm,mpierr)

    ! unpack result from buffer into seedC and seedD
    seedC(1:k,1:k) = 0.0d0
    do icol=1,k
        seedC(1:k,icol) = work(recvoffset+(icol-1)*k:recvoffset+icol*k-1)
    end do

    seedD(1:k,1:k) = 0.0d0
    do icol=1,k
        seedD(1:k,icol) = work(recvoffset+C_size+(icol-1)*k:recvoffset+C_size+icol*k-1)
    end do
end subroutine qr_pdlarfgk_1dcomm_seed

! k is assumed to be larger than two
subroutine qr_pdlarfgk_1dcomm_check_improved(seedC,seedD,k,PQRPARAM,work,lwork,possiblerank)
Andreas Marek's avatar
Andreas Marek committed
1374
    use precision
1375
1376
1377
    implicit none

    ! input variables (global)
Andreas Marek's avatar
Andreas Marek committed
1378
1379
1380
    integer(kind=ik)   :: k,lwork
    integer(kind=ik)   :: PQRPARAM(*)
    real(kind=rk)      :: seedC(k,*),seedD(k,*),work(k,*)
1381
1382

    ! output variables (global)
Andreas Marek's avatar
Andreas Marek committed
1383
    integer(kind=ik)   :: possiblerank
1384
1385

    ! derived input variables from QR_PQRPARAM
Andreas Marek's avatar
Andreas Marek committed
1386
    integer(kind=ik)   :: eps
1387
1388

    ! local variables
Andreas Marek's avatar
Andreas Marek committed
1389
1390
1391
    integer(kind=ik)   :: i,j,l
    real(kind=rk)      :: sum_squares,diagonal_square,relative_error,epsd,diagonal_root
    real(kind=rk)      :: dreverse_matrix_work(1)
1392
1393

    ! external functions
Andreas Marek's avatar
Andreas Marek committed
1394
1395
    real(kind=rk), external :: ddot,dlapy2,dnrm2
    external                :: dscal
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481

    if (lwork .eq. -1) then
        call reverse_matrix_local(1,k,k,work,k,dreverse_matrix_work,-1)
        work(1,1) = DBLE(k*k) + dreverse_matrix_work(1)
        return
    end if

    eps = PQRPARAM(3)

    if (eps .eq. 0) then
        possiblerank = k
        return
    end if

    epsd = DBLE(eps)

    ! build complete inner product from seedC and seedD
    ! copy seedD to work
    work(:,1:k) = seedD(:,1:k)

    ! add inner products of seedC to work
    call dsyrk("Upper", "Trans", k, k, &
               1.0d0, seedC(1,1), k, &
               1.0d0, work, k)

	! TODO: optimize this part!
	call reverse_matrix_local(0,k,k,work(1,1),k,work(1,k+1),lwork-2*k)
	call reverse_matrix_local(1,k,k,work(1,1),k,work(1,k+1),lwork-2*k)

    ! transpose matrix
	do i=1,k
	  do j=i+1,k
	    work(i,j) = work(j,i)
	  end do
	end do


    ! do cholesky decomposition
    i = 0
    do while ((i .lt. k))
      i = i + 1

      diagonal_square = abs(work(i,i))
      diagonal_root  = sqrt(diagonal_square)

      ! zero Householder vector (zero norm) case
      if ((abs(diagonal_square) .eq. 0.0d0) .or. (abs(diagonal_root) .eq. 0.0d0)) then
        possiblerank = max(i-1,1)
        return
      end if

      ! check if relative error is bounded for each Householder vector
      ! Householder i is stable iff Househoulder i-1 is "stable" and the accuracy criterion
      ! holds.
      ! first Householder vector is considered as "stable".

      do j=i+1,k
          work(i,j) = work(i,j) / diagonal_root
          do l=i+1,j
              work(l,j) = work(l,j) - work(i,j) * work(i,l)
          end do
      end do
      !print *,'cholesky step done'

      ! build sum of squares
      if(i .eq. 1) then
        sum_squares = 0.0d0
      else
        sum_squares = ddot(i-1,work(1,i),1,work(1,i),1)
      end if
      !relative_error = sum_squares / diagonal_square
      !print *,'error ',i,sum_squares,diagonal_square,relative_error

      if (sum_squares .ge. (epsd * diagonal_square)) then
        possiblerank = max(i-1,1)
        return
      end if
    end do

    possiblerank = i
    !print *,'possible rank', possiblerank
end subroutine qr_pdlarfgk_1dcomm_check_improved

! TODO: zero Householder vector (zero norm) case
! - check alpha values as well (from seedC)
subroutine qr_pdlarfgk_1dcomm_check(seedC,seedD,k,PQRPARAM,work,lwork,possiblerank)
Andreas Marek's avatar
Andreas Marek committed
1482
    use precision
1483
1484
1485
1486
1487
1488
1489
1490
1491
    use qr_utils_mod

    implicit none

    ! parameter setup

    ! input variables (local)

    ! input variables (global)
Andreas Marek's avatar
Andreas Marek committed
1492
1493
1494
    integer(kind=ik)   :: k,lwork
    integer(kind=ik)   :: PQRPARAM(*)
    real(kind=rk)      :: seedC(k,*),seedD(k,*),work(k,*)
1495
1496

    ! output variables (global)
Andreas Marek's avatar
Andreas Marek committed
1497
    integer(kind=ik)   :: possiblerank
1498
1499

    ! derived input variables from QR_PQRPARAM
Andreas Marek's avatar
Andreas Marek committed
1500
    integer(kind=ik)   :: eps
1501
1502

    ! local scalars
Andreas Marek's avatar
Andreas Marek committed
1503
1504
1505
    integer(kind=ik)   :: icol,isqr,iprod
    real(kind=rk)      :: epsd,sum_sqr,sum_products,diff,temp,ortho,ortho_sum
    real(kind=rk)      :: dreverse_matrix_work(1)
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612

    if (lwork .eq. -1) then
        call reverse_matrix_local(1,k,k,work,k,dreverse_matrix_work,-1)
        work(1,1) = DBLE(k*k) + dreverse_matrix_work(1)
        return
    end if

    eps = PQRPARAM(3)

    if (eps .eq. 0) then
        possiblerank = k
        return
    end if

    epsd = DBLE(eps)


    ! copy seedD to work
    work(:,1:k) = seedD(:,1:k)

    ! add inner products of seedC to work
    call dsyrk("Upper", "Trans", k, k, &
               1.0d0, seedC(1,1), k, &
               1.0d0, work, k)

	! TODO: optimize this part!
	call reverse_matrix_local(0,k,k,work(1,1),k,work(1,k+1),lwork-2*k)
	call reverse_matrix_local(1,k,k,work(1,1),k,work(1,k+1),lwork-2*k)

	! transpose matrix
	do icol=1,k
		do isqr=icol+1,k
			work(icol,isqr) = work(isqr,icol)
		end do
	end do

    ! work contains now the full inner product of the global (sub-)matrix
    do icol=1,k
        ! zero Householder vector (zero norm) case
        if (abs(work(icol,icol)) .eq. 0.0d0) then
            !print *,'too small ', icol, work(icol,icol)
            possiblerank = max(icol,1)
            return
        end if

        sum_sqr = 0.0d0
        do isqr=1,icol-1
            sum_products = 0.0d0
            do iprod=1,isqr-1
                sum_products = sum_products + work(iprod,isqr)*work(iprod,icol)
            end do

            !print *,'divisor',icol,isqr,work(isqr,isqr)
            temp = (work(isqr,icol) - sum_products)/work(isqr,isqr)
            work(isqr,icol) = temp
            sum_sqr = sum_sqr + temp*temp
        end do

        ! calculate diagonal value
        diff = work(icol,icol) - sum_sqr
        if (diff .lt. 0.0d0) then
            ! we definitely have a problem now
            possiblerank = icol-1 ! only decompose to previous column (including)
            return
        end if
        work(icol,icol) = sqrt(diff)

        ! calculate orthogonality
        ortho = 0.0d0
        do isqr=1,icol-1
            ortho_sum = 0.0d0
            do iprod=isqr,icol-1
                temp = work(isqr,iprod)*work(isqr,iprod)
                !print *,'ortho ', work(iprod,iprod)
                temp = temp / (work(iprod,iprod)*work(iprod,iprod))
                ortho_sum = ortho_sum + temp
            end do
            ortho = ortho + ortho_sum * (work(isqr,icol)*work(isqr,icol))
        end do

        ! ---------------- with division by zero ----------------------- !

        !ortho = ortho / diff;

        ! if current estimate is not accurate enough, the following check holds
        !if (ortho .gt. epsd) then
        !    possiblerank = icol-1 ! only decompose to previous column (including)
        !    return
        !end if

        ! ---------------- without division by zero ----------------------- !

        ! if current estimate is not accurate enough, the following check holds
        if (ortho .gt. epsd * diff) then
            possiblerank = icol-1 ! only decompose to previous column (including)
            return
        end if
    end do

    ! if we get to this point, the accuracy condition holds for the whole block
    possiblerank = k
end subroutine qr_pdlarfgk_1dcomm_check

!sidx: seed idx
!k: max rank used during seed phase
!rank: actual rank (k >= rank)
subroutine qr_pdlarfgk_1dcomm_vector(x,incx,baseidx,tau,seedC,seedD,k,sidx,n,nb,rev,mpicomm)
Andreas Marek's avatar
Andreas Marek committed
1613
    use precision
1614
1615
1616
1617
1618
1619
    use ELPA1
    use qr_utils_mod

    implicit none

    ! input variables (local)
Andreas Marek's avatar
Andreas Marek committed
1620
1621
    integer(kind=ik)  :: incx
    real(kind=rk)     :: x(*),tau
1622
1623

    ! input variables (global)
Andreas Marek's avatar
Andreas Marek committed
1624
1625
    integer(kind=ik)  :: n,nb,baseidx,rev,mpicomm,k,sidx
    real(kind=rk)     :: seedC(k,*),seedD(k,*)
1626
1627
1628
1629

    ! output variables (global)

    ! external functions
Andreas Marek's avatar
Andreas Marek committed
1630
1631
    real(kind=rk), external :: dlapy2,dnrm2
    external                :: dscal
1632
1633

    ! local scalars
Andreas Marek's avatar
Andreas Marek committed
1634
1635
1636
1637
    integer(kind=ik)   :: mpirank,mpirank_top,mpiprocs,mpierr
    real(kind=rk)      :: alpha,dot,beta,xnorm
    integer(kind=ik)   :: local_size,baseoffset,local_offset,top,topidx
    integer(kind=ik)   :: lidx
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

	lidx = baseidx-sidx+1
	call local_size_offset_1d(n,nb,baseidx,lidx-1,rev,mpirank,mpiprocs, &
							  local_size,baseoffset,local_offset)

    local_offset = local_offset * incx

    ! Processor id for global index of top element
    mpirank_top = MOD((lidx-1)/nb,mpiprocs)
    if (mpirank .eq. mpirank_top) then
        topidx = local_index((lidx),mpirank_top,mpiprocs,nb,0)
        top = 1+(topidx-1)*incx
    end if

	alpha = seedC(k-sidx+1,k-sidx+1)
	dot = seedD(k-sidx+1,k-sidx+1)
	! assemble actual norm from both seed parts
	xnorm = dlapy2(sqrt(dot), dnrm2(k-sidx,seedC(1,k-sidx+1),1))

    if (xnorm .eq. 0.0d0) then
        tau = 0.0d0
    else
        ! General case

        beta = sign(dlapy2(alpha, xnorm), alpha)
        ! store a preliminary version of beta in tau
        tau = beta

        ! update global part
        call dscal(local_size, 1.0d0/(beta+alpha), &
                   x(local_offset), incx)

        ! do not update local part here due to
        ! dependency of c vector during update process

        ! TODO: reimplement norm rescale method of
        ! original PDLARFG using mpi?

        if (mpirank .eq. mpirank_top) then
            x(top) = -beta
        end if
    end if

end subroutine qr_pdlarfgk_1dcomm_vector

!k: original max rank used during seed function
!rank: possible rank as from check function
! TODO: if rank is less than k, reduce buffersize in such a way
! that only the required entries for the next pdlarfg steps are
! computed
subroutine qr_pdlarfgk_1dcomm_update(a,lda,baseidx,work,lwork,seedC,seedD,k,rank,sidx,tau,n,nb,rev,mpicomm)
Andreas Marek's avatar
Andreas Marek committed
1692
    use precision
1693
1694
1695
1696
1697
1698
    use ELPA1
    use qr_utils_mod

    implicit none

    ! parameter setup
Andreas Marek's avatar
Andreas Marek committed
1699
    INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2,eps_ = 3, upmode1_ = 4
1700
1701

    ! input variables (local)
Andreas Marek's avatar
Andreas Marek committed
1702
1703
    integer(kind=ik)            :: lda,lwork
    real(kind=rk)               :: a(lda,*),work(*)
1704
1705

    ! input variables (global)
Andreas Marek's avatar
Andreas Marek committed
1706
1707
    integer(kind=ik)            :: k,rank,sidx,n,baseidx,nb,rev,mpicomm
    real(kind=rk)               :: beta
1708
1709

    ! output variables (global)
Andreas Marek's avatar
Andreas Marek committed
1710
    real(kind=rk)               :: seedC(k,*),seedD(k,*),tau(*)