module elpa_pdgeqrf
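
    ! Distributed blocked Householder QR decomposition routines used by ELPA.
    ! (Descriptive summary added for readability; the individual routines below
    !  are authoritative.)
    !
    ! All work routines follow the LAPACK convention for workspace queries:
    ! calling them with lwork = -1 performs no computation and returns the
    ! required workspace size in work(1).
    !
    ! As used throughout this file, PQRPARAM(1:3) carry the blocking size, the
    ! update mode ('I' incremental, 'M' merged, 'F' full) and the T-matrix
    ! merge setting of the current level, while PQRPARAM(4:) is passed on as
    ! the parameter set of the next lower (1D) level.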

    use elpa1
    use elpa_pdlarfb
    use tum_utils
 
    implicit none

    PRIVATE

    public :: tum_pdgeqrf_2dcomm
    public :: tum_pqrparam_init
    public :: tum_pdlarfg2_1dcomm_check
    
    include 'mpif.h'

contains

subroutine tum_pdgeqrf_2dcomm(a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,nb,rowidx,colidx,rev,trans,PQRPARAM,mpicomm_rows,mpicomm_cols,blockheuristic)
    use ELPA1
    use tum_utils
  
    implicit none
 
    ! parameter setup
    INTEGER     gmode_,rank_,eps_
    PARAMETER   (gmode_ = 1,rank_ = 2,eps_=3)

    ! input variables (local)
    integer lda,lwork,ldv,ldt
    double precision a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*)

    ! input variables (global)
    integer m,n,mb,nb,rowidx,colidx,rev,trans,mpicomm_cols,mpicomm_rows
    integer PQRPARAM(*)
 
    ! output variables (global)
    double precision blockheuristic(*)

    ! input variables derived from PQRPARAM
    integer updatemode,tmerge,size2d

    ! local scalars
    integer mpierr,mpirank_cols,broadcast_size,mpirank_rows
    integer mpirank_cols_qr,mpiprocs_cols
    integer lcols_temp,lcols,icol,lastcol
    integer baseoffset,offset,idx,voffset
    integer update_voffset,update_tauoffset
    integer update_lcols,ivector
    integer work_offset

    double precision dbroadcast_size(1),dtmat_bcast_size(1)
    double precision pdgeqrf_size(1),pdlarft_size(1),pdlarfb_size(1),tmerge_pdlarfb_size(1)
    integer temptau_offset,temptau_size,broadcast_offset,tmat_bcast_size
    integer remaining_cols
    integer total_cols
    integer incremental_update_start,incremental_update_size ! needed for incremental update mode
 
    size2d = PQRPARAM(1)
    updatemode = PQRPARAM(2)
    tmerge = PQRPARAM(3)

    ! copy value before we are going to filter it
    total_cols = n

    call mpi_comm_rank(mpicomm_cols,mpirank_cols,mpierr)
    call mpi_comm_rank(mpicomm_rows,mpirank_rows,mpierr)
    call mpi_comm_size(mpicomm_cols,mpiprocs_cols,mpierr)
  
   
    call tum_pdgeqrf_1dcomm(a,lda,v,ldv,tau,t,ldt,pdgeqrf_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,trans,PQRPARAM(4),mpicomm_rows,blockheuristic)
    call tum_pdgeqrf_pack_unpack(v,ldv,dbroadcast_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,0,mpicomm_rows)
    call tum_pdgeqrf_pack_unpack_tmatrix(tau,t,ldt,dtmat_bcast_size(1),-1,total_cols,0)
    pdlarft_size(1) = 0.0d0
    call tum_pdlarfb_1dcomm(m,mb,total_cols,total_cols,a,lda,v,ldv,tau,t,ldt,rowidx,rowidx,rev,mpicomm_rows,pdlarfb_size(1),-1)
    call tum_tmerge_pdlarfb_1dcomm(m,mb,total_cols,total_cols,total_cols,v,ldv,work,t,ldt,a,lda,rowidx,rowidx,rev,updatemode,mpicomm_rows,tmerge_pdlarfb_size(1),-1)


    temptau_offset = 1
    temptau_size = total_cols
    broadcast_offset = temptau_offset + temptau_size 
    broadcast_size = dbroadcast_size(1) + dtmat_bcast_size(1)
    work_offset = broadcast_offset + broadcast_size

    if (lwork .eq. -1) then
        work(1) = (DBLE(temptau_size) + DBLE(broadcast_size) + max(pdgeqrf_size(1),pdlarft_size(1),pdlarfb_size(1),tmerge_pdlarfb_size(1)))
        return
    end if
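
    ! Note: the branch above implements the workspace query. A caller would
    ! typically invoke the routine twice, e.g. (illustrative sketch only;
    ! "wsize" is a placeholder double precision array of length >= 1):
    !
    !   call tum_pdgeqrf_2dcomm(a,lda,v,ldv,tau,t,ldt,wsize,-1,m,n,mb,nb, &
    !                           rowidx,colidx,rev,trans,PQRPARAM, &
    !                           mpicomm_rows,mpicomm_cols,blockheuristic)
    !   lwork = int(wsize(1))
    !   ... allocate work(lwork) ...
    !   call tum_pdgeqrf_2dcomm(a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,nb, &
    !                           rowidx,colidx,rev,trans,PQRPARAM, &
    !                           mpicomm_rows,mpicomm_cols,blockheuristic)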

    lastcol = colidx-total_cols+1
    voffset = total_cols
  
    incremental_update_size = 0
 
    ! clear the v buffer: make sure there is no junk in its upper triangular
    ! part, otherwise pdlarfb runs into problems;
    ! pdlarfl(2) does not have this issue, since it works on a per-vector basis
    v(1:ldv,1:total_cols) = 0.0d0
 
    icol = colidx

    remaining_cols = total_cols

    !print *,'start decomposition',m,rowidx,colidx
 
    do while (remaining_cols .gt. 0)

        ! determine rank of process column with next qr block
        mpirank_cols_qr = MOD((icol-1)/nb,mpiprocs_cols)
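        ! (block-cyclic column distribution: e.g. with nb = 2 and 3 process
        !  columns, global column icol = 5 is owned by process column
        !  MOD((5-1)/2,3) = 2)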

        ! lcols can't be larger than nb
        ! exception: there is only one process column
 
        ! however, we might not start at the first local column,
        ! so assume a matrix of size (1 x lcols) starting at (1,icol)
        ! and determine the actual number of local columns
        lcols_temp = min(nb,(icol-lastcol+1))

        ! blocking parameter
        lcols_temp = max(min(lcols_temp,size2d),1)

        ! determine size from last decomposition column
        !  to first decomposition column
        call local_size_offset_1d(icol,nb,icol-lcols_temp+1,icol-lcols_temp+1,0, &
                                      mpirank_cols_qr,mpiprocs_cols, &
                                      lcols,baseoffset,offset)
 
        voffset = remaining_cols - lcols + 1

        idx = rowidx - colidx + icol

        if (mpirank_cols .eq. mpirank_cols_qr) then 
            ! qr decomposition part

            tau(offset:offset+lcols-1) = 0.0d0

            call tum_pdgeqrf_1dcomm(a(1,offset),lda,v(1,voffset),ldv,tau(offset),t(voffset,voffset),ldt,work(work_offset),lwork,m,lcols,mb,rowidx,idx,rev,trans,PQRPARAM(4),mpicomm_rows,blockheuristic)
            !print *,'offset voffset',offset,voffset,idx
    
            ! pack broadcast buffer (v + tau)
            call tum_pdgeqrf_pack_unpack(v(1,voffset),ldv,work(broadcast_offset),lwork,m,lcols,mb,rowidx,idx,rev,0,mpicomm_rows)
     
            ! determine broadcast size
            call tum_pdgeqrf_pack_unpack(v(1,voffset),ldv,dbroadcast_size(1),-1,m,lcols,mb,rowidx,idx,rev,0,mpicomm_rows)
            broadcast_size = dbroadcast_size(1)
  
            !if (mpirank_rows .eq. 0) then
            ! pack tmatrix into broadcast buffer and calculate new size
            call tum_pdgeqrf_pack_unpack_tmatrix(tau(offset),t(voffset,voffset),ldt,work(broadcast_offset+broadcast_size),lwork,lcols,0)
            call tum_pdgeqrf_pack_unpack_tmatrix(tau(offset),t(voffset,voffset),ldt,dtmat_bcast_size(1),-1,lcols,0)
            broadcast_size = broadcast_size + dtmat_bcast_size(1)
            !end if
 
            ! initiate broadcast (send part)
            call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, &
                           mpirank_cols_qr,mpicomm_cols,mpierr)

            ! copy tau parts into temporary tau buffer
            work(temptau_offset+voffset-1:temptau_offset+(voffset-1)+lcols-1) = tau(offset:offset+lcols-1)

            !print *,'generated tau:', tau(offset)
        else
            ! vector exchange part

            ! determine broadcast size
            call tum_pdgeqrf_pack_unpack(v(1,voffset),ldv,dbroadcast_size(1),-1,m,lcols,mb,rowidx,idx,rev,1,mpicomm_rows)
            broadcast_size = dbroadcast_size(1)
 
            call tum_pdgeqrf_pack_unpack_tmatrix(work(temptau_offset+voffset-1),t(voffset,voffset),ldt,dtmat_bcast_size(1),-1,lcols,0)
            tmat_bcast_size = dtmat_bcast_size(1)

            !print *,'broadcast_size (nonqr)',broadcast_size
            broadcast_size = dbroadcast_size(1) + dtmat_bcast_size(1)
 
            ! initiate broadcast (recv part)
            call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, &
                           mpirank_cols_qr,mpicomm_cols,mpierr)
            
            ! last n*n elements in buffer are (still empty) T matrix elements
            ! fetch from first process in each column
 
            ! unpack broadcast buffer (v + tau)
            call tum_pdgeqrf_pack_unpack(v(1,voffset),ldv,work(broadcast_offset),lwork,m,lcols,mb,rowidx,idx,rev,1,mpicomm_rows)
 
            ! now send t matrix to other processes in our process column
            broadcast_size = dbroadcast_size(1)
            tmat_bcast_size = dtmat_bcast_size(1)

            ! t matrix should now be available on all processes => unpack
            call tum_pdgeqrf_pack_unpack_tmatrix(work(temptau_offset+voffset-1),t(voffset,voffset),ldt,work(broadcast_offset+broadcast_size),lwork,lcols,1)
        end if

        remaining_cols = remaining_cols - lcols
 
        ! apply householder vectors to whole trailing matrix parts (if any)

        update_voffset = voffset
        update_tauoffset = icol
        update_lcols = lcols
        incremental_update_size = incremental_update_size + lcols
 
        icol = icol - lcols
        ! count columns from the first column of the global block to the current index
        call local_size_offset_1d(icol,nb,colidx-n+1,colidx-n+1,0, &
                                      mpirank_cols,mpiprocs_cols, &
                                      lcols,baseoffset,offset)

        if (lcols .gt. 0) then

            !print *,'updating trailing matrix'

            if (updatemode .eq. ichar('I')) then
                print *,'pdgeqrf_2dcomm: incremental update not yet implemented! rev=1'
            else if (updatemode .eq. ichar('F')) then
                ! full update no merging
                call tum_pdlarfb_1dcomm(m,mb,lcols,update_lcols,a(1,offset),lda,v(1,update_voffset),ldv, &
                                        work(temptau_offset+update_voffset-1),t(update_voffset,update_voffset),ldt, &
                                        rowidx,idx,1,mpicomm_rows,work(work_offset),lwork)
            else
                ! full update + merging default
                call tum_tmerge_pdlarfb_1dcomm(m,mb,lcols,n-(update_voffset+update_lcols-1),update_lcols,v(1,update_voffset),ldv, &
                                               work(temptau_offset+update_voffset-1),t(update_voffset,update_voffset),ldt, &
                                               a(1,offset),lda,rowidx,idx,1,updatemode,mpicomm_rows,work(work_offset),lwork)
            end if
        else
            if (updatemode .eq. ichar('I')) then
                print *,'sole merging of (incremental) T matrix', mpirank_cols, n-(update_voffset+incremental_update_size-1)
                call tum_tmerge_pdlarfb_1dcomm(m,mb,0,n-(update_voffset+incremental_update_size-1),incremental_update_size,v(1,update_voffset),ldv, &
                                               work(temptau_offset+update_voffset-1),t(update_voffset,update_voffset),ldt, &
                                               a,lda,rowidx,idx,1,updatemode,mpicomm_rows,work(work_offset),lwork)

                ! reset for upcoming incremental updates
                incremental_update_size = 0
            else if (updatemode .eq. ichar('M')) then
                ! final merge
                call tum_tmerge_pdlarfb_1dcomm(m,mb,0,n-(update_voffset+update_lcols-1),update_lcols,v(1,update_voffset),ldv, &
                                               work(temptau_offset+update_voffset-1),t(update_voffset,update_voffset),ldt, &
                                               a,lda,rowidx,idx,1,updatemode,mpicomm_rows,work(work_offset),lwork)
            else
                ! full updatemode - nothing to update
            end if

            ! reset for upcoming incremental updates
            incremental_update_size = 0
        end if
    end do
 
    if ((tmerge .gt. 0) .and. (updatemode .eq. ichar('F'))) then
        ! finally merge all small T parts
        call tum_pdlarft_tree_merge_1dcomm(m,mb,n,size2d,tmerge,v,ldv,tau,t,ldt,rowidx,rowidx,rev,mpicomm_rows,work,lwork)
    end if

    !print *,'stop decomposition',rowidx,colidx
 
end subroutine tum_pdgeqrf_2dcomm

subroutine tum_pdgeqrf_1dcomm(a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,baseidx,rowidx,rev,trans,PQRPARAM,mpicomm,blockheuristic)
    use ELPA1
  
    implicit none
 
    ! parameter setup
    INTEGER     gmode_,rank_,eps_
    PARAMETER   (gmode_ = 1,rank_ = 2,eps_=3)

    ! input variables (local)
    integer lda,lwork,ldv,ldt
    double precision a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(*)

    ! input variables (global)
    integer m,n,mb,baseidx,rowidx,rev,trans,mpicomm
    integer PQRPARAM(*)

    ! derived input variables
  
    ! derived further input variables from TUM_PQRPARAM
    integer size1d,updatemode,tmerge

    ! output variables (global)
    double precision blockheuristic(*)
 
    ! local scalars
    integer nr_blocks,remainder,current_block,aoffset,idx,updatesize
    double precision pdgeqr2_size(1),pdlarfb_size(1),tmerge_tree_size(1)

    size1d = max(min(PQRPARAM(1),n),1)
    updatemode = PQRPARAM(2)
    tmerge = PQRPARAM(3)

    if (lwork .eq. -1) then
        call tum_pdgeqr2_1dcomm(a,lda,v,ldv,tau,t,ldt,pdgeqr2_size,-1, & 
                                m,size1d,mb,baseidx,baseidx,rev,trans,PQRPARAM(4),mpicomm,blockheuristic)

        ! reserve more space for incremental mode
        call tum_tmerge_pdlarfb_1dcomm(m,mb,n,n,n,v,ldv,tau,t,ldt, &
                                       a,lda,baseidx,baseidx,rev,updatemode,mpicomm,pdlarfb_size,-1)
 
        call tum_pdlarft_tree_merge_1dcomm(m,mb,n,size1d,tmerge,v,ldv,tau,t,ldt,baseidx,baseidx,rev,mpicomm,tmerge_tree_size,-1)

        work(1) = max(pdlarfb_size(1),pdgeqr2_size(1),tmerge_tree_size(1))
        return
    end if

        nr_blocks = n / size1d
        remainder = n - nr_blocks*size1d
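        ! (e.g. n = 10 and size1d = 4 give nr_blocks = 2 and remainder = 2;
        !  the blocks are processed right to left: columns 7:10, then 3:6,
        !  then the remaining columns 1:2)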
  
        current_block = 0
        do while (current_block .lt. nr_blocks)
            idx = rowidx-current_block*size1d
            updatesize = n-(current_block+1)*size1d
            aoffset = 1+updatesize

            call tum_pdgeqr2_1dcomm(a(1,aoffset),lda,v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt,work,lwork, & 
                                    m,size1d,mb,baseidx,idx,1,trans,PQRPARAM(4),mpicomm,blockheuristic)

            if (updatemode .eq. ichar('M')) then
                ! full update + merging
                call tum_tmerge_pdlarfb_1dcomm(m,mb,updatesize,current_block*size1d,size1d, & 
                                               v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt, &
                                               a,lda,baseidx,idx,1,ichar('F'),mpicomm,work,lwork)
            else if (updatemode .eq. ichar('I')) then
                if (updatesize .ge. size1d) then
                    ! incremental update + merging
                    call tum_tmerge_pdlarfb_1dcomm(m,mb,size1d,current_block*size1d,size1d, & 
                                                   v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt, &
                                                   a(1,aoffset-size1d),lda,baseidx,idx,1,updatemode,mpicomm,work,lwork)

                else ! only remainder left
                    ! incremental update + merging
                    call tum_tmerge_pdlarfb_1dcomm(m,mb,remainder,current_block*size1d,size1d, & 
                                                   v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt, &
                                                   a(1,1),lda,baseidx,idx,1,updatemode,mpicomm,work,lwork)
                end if
            else ! full update no merging is default
                ! full update no merging
                call tum_pdlarfb_1dcomm(m,mb,updatesize,size1d,a,lda,v(1,aoffset),ldv, &
                                        tau(aoffset),t(aoffset,aoffset),ldt,baseidx,idx,1,mpicomm,work,lwork)
            end if

            ! move on to next block
            current_block = current_block+1
        end do

        if (remainder .gt. 0) then
            aoffset = 1
            idx = rowidx-size1d*nr_blocks
            call tum_pdgeqr2_1dcomm(a(1,aoffset),lda,v,ldv,tau,t,ldt,work,lwork, &
                                    m,remainder,mb,baseidx,idx,1,trans,PQRPARAM(4),mpicomm,blockheuristic)

            if ((updatemode .eq. ichar('I')) .or. (updatemode .eq. ichar('M'))) then
                ! final merging
                call tum_tmerge_pdlarfb_1dcomm(m,mb,0,size1d*nr_blocks,remainder, & 
                                               v,ldv,tau,t,ldt, &
                                               a,lda,baseidx,idx,1,updatemode,mpicomm,work,lwork) ! updatemode argument does not matter
            end if
        end if
 
    if ((tmerge .gt. 0) .and. (updatemode .eq. ichar('F'))) then
        ! finally merge all small T parts
        call tum_pdlarft_tree_merge_1dcomm(m,mb,n,size1d,tmerge,v,ldv,tau,t,ldt,baseidx,idx,rev,mpicomm,work,lwork)
    end if

end subroutine tum_pdgeqrf_1dcomm

! local a and tau are assumed to be positioned at the right column from a local
! perspective
! TODO: if the local amount of data becomes zero, the algorithm might produce
! wrong results (probably due to stale buffer contents)
subroutine tum_pdgeqr2_1dcomm(a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,baseidx,rowidx,rev,trans,PQRPARAM,mpicomm,blockheuristic)
    use ELPA1
  
    implicit none
 
    ! parameter setup
    INTEGER     gmode_,rank_,eps_,upmode1_
    PARAMETER   (gmode_ = 1,rank_ = 2,eps_=3, upmode1_=4)

    ! input variables (local)
    integer lda,lwork,ldv,ldt
    double precision a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(*)

    ! input variables (global)
    integer m,n,mb,baseidx,rowidx,rev,trans,mpicomm
    integer PQRPARAM(*)
 
    ! output variables (global)
    double precision blockheuristic(*)
 
    ! derived further input variables from TUM_PQRPARAM
    integer maxrank,hgmode,updatemode

    ! local scalars
    integer icol,incx,idx,topidx,top,ivector,irank
    double precision pdlarfg_size(1),pdlarf_size(1),topbak,temp_size(1),total_size
    double precision pdlarfg2_size(1),pdlarfgk_size(1),pdlarfl2_size(1)
    double precision pdlarft_size(1),pdlarfb_size(1),pdlarft_pdlarfb_size(1),tmerge_pdlarfb_size(1)
    integer mpirank,mpiprocs,mpierr,mpirank_top
    integer rank,lastcol,actualrank,nextrank
    integer update_cols,decomposition_cols
    integer current_column
 
    maxrank = min(PQRPARAM(1),n)
    updatemode = PQRPARAM(2)
    hgmode = PQRPARAM(4)

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

    if (trans .eq. 1) then
        incx = lda
    else
        incx = 1
    end if
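    ! (trans == 1: the Householder vector is taken from a row of a, i.e. with
    !  stride lda; otherwise from a column with stride 1)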
    
    if (lwork .eq. -1) then
        call tum_pdlarfg_1dcomm(a,incx,tau(1),pdlarfg_size(1),-1,n,rowidx,mb,hgmode,rev,mpicomm)
        call tum_pdlarfl_1dcomm(v,1,baseidx,a,lda,tau(1),pdlarf_size(1),-1,m,n,rowidx,mb,rev,mpicomm)
        call tum_pdlarfg2_1dcomm_ref(a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfg2_size(1),-1,m,n,rowidx,mb,PQRPARAM,rev,mpicomm,actualrank)
        call tum_pdlarfgk_1dcomm(a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfgk_size(1),-1,m,n,rowidx,mb,PQRPARAM,rev,mpicomm,actualrank)
        call tum_pdlarfl2_tmatrix_1dcomm(v,ldv,baseidx,a,lda,t,ldt,pdlarfl2_size(1),-1,m,n,rowidx,mb,rev,mpicomm)
        pdlarft_size(1) = 0.0d0
        call tum_pdlarfb_1dcomm(m,mb,n,n,a,lda,v,ldv,tau,t,ldt,baseidx,rowidx,1,mpicomm,pdlarfb_size(1),-1)
        pdlarft_pdlarfb_size(1) = 0.0d0
        call tum_tmerge_pdlarfb_1dcomm(m,mb,n,n,n,v,ldv,work,t,ldt,a,lda,rowidx,idx,rev,updatemode,mpicomm,tmerge_pdlarfb_size(1),-1)

        total_size = max(pdlarfg_size(1),pdlarf_size(1),pdlarfg2_size(1),pdlarfgk_size(1),pdlarfl2_size(1),pdlarft_size(1),pdlarfb_size(1),pdlarft_pdlarfb_size(1),tmerge_pdlarfb_size(1))

        work(1) = total_size
        return
    end if

        icol = 1
        lastcol = min(rowidx,n)
        decomposition_cols = lastcol
        update_cols = n
        do while (decomposition_cols .gt. 0) ! local qr block
            icol = lastcol-decomposition_cols+1
            idx = rowidx-icol+1

            ! get possible rank size
            ! limited by number of columns and remaining rows
            rank = min(n-icol+1,maxrank,idx)
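            ! (e.g. with n-icol+1 = 5 columns left, maxrank = 4 and idx = 3
            !  remaining rows, rank = min(5,4,3) = 3)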

            current_column = n-icol+1-rank+1

            if (rank .eq. 1) then

                call tum_pdlarfg_1dcomm(a(1,current_column),incx, &
                                        tau(current_column),work,lwork, &
                                        m,idx,mb,hgmode,1,mpicomm)

                v(1:ldv,current_column) = 0.0d0
                call tum_pdlarfg_copy_1dcomm(a(1,current_column),incx, &
                                             v(1,current_column),1, &
                                             m,baseidx,idx,mb,1,mpicomm)

                ! initialize t matrix part
                t(current_column,current_column) = tau(current_column)

                actualrank = 1

            else if (rank .eq. 2) then 
                call tum_pdlarfg2_1dcomm_ref(a(1,current_column),lda,tau(current_column), &
                                             t(current_column,current_column),ldt,v(1,current_column),ldv, &
                                            baseidx,work,lwork,m,rank,idx,mb,PQRPARAM,1,mpicomm,actualrank)
            
            else 
                call tum_pdlarfgk_1dcomm(a(1,current_column),lda,tau(current_column), &
                                         t(current_column,current_column),ldt,v(1,current_column),ldv, &
                                         baseidx,work,lwork,m,rank,idx,mb,PQRPARAM,1,mpicomm,actualrank)
 
            end if
  
            blockheuristic(actualrank) = blockheuristic(actualrank) + 1

            ! the blocked decomposition variants have already updated their
            ! not-yet-decomposed parts using the information gathered during
            ! communication
            update_cols = decomposition_cols - rank
            decomposition_cols = decomposition_cols - actualrank
 
            ! needed for incremental update
            nextrank = min(n-(lastcol-decomposition_cols+1)+1,maxrank,rowidx-(lastcol-decomposition_cols+1)+1)

            if (current_column .gt. 1) then
                idx = rowidx-icol+1
 
                if (updatemode .eq. ichar('I')) then
                    ! incremental update + merging
                    call tum_tmerge_pdlarfb_1dcomm(m,mb,nextrank-(rank-actualrank),n-(current_column+rank-1),actualrank,v(1,current_column+(rank-actualrank)),ldv,tau(current_column+(rank-actualrank)), &
                                                   t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, &
                                                   a(1,current_column-nextrank+(rank-actualrank)),lda,baseidx,idx,rev,updatemode,mpicomm,work,lwork)
                else
                    ! full update + merging
                    call tum_tmerge_pdlarfb_1dcomm(m,mb,update_cols,n-(current_column+rank-1),actualrank,v(1,current_column+(rank-actualrank)),ldv,tau(current_column+(rank-actualrank)), &
                                                   t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, &
                                                   a(1,1),lda,baseidx,idx,rev,updatemode,mpicomm,work,lwork)
                end if
            else
                call tum_tmerge_pdlarfb_1dcomm(m,mb,0,n-(current_column+rank-1),actualrank,v(1,current_column+(rank-actualrank)),ldv,tau(current_column+(rank-actualrank)), &
                                               t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, &
                                               a,lda,baseidx,idx,rev,updatemode,mpicomm,work,lwork)
            end if

        end do
end subroutine tum_pdgeqr2_1dcomm

! incx == 1: column major
! incx != 1: row major
subroutine tum_pdlarfg_1dcomm(x,incx,tau,work,lwork,n,idx,nb,hgmode,rev,mpi_comm)
    use ELPA1
    use tum_utils
    
    implicit none
 
    ! parameter setup
    INTEGER     gmode_,rank_,eps_
    PARAMETER   (gmode_ = 1,rank_ = 2,eps_=3)

    ! input variables (local)
    integer incx,lwork,hgmode
    double precision x(*),work(*)

    ! input variables (global)
    integer mpi_comm,nb,idx,n,rev

    ! output variables (global)
    double precision tau

    ! local scalars
    integer mpierr,mpirank,mpiprocs,mpirank_top
    integer sendsize,recvsize
    integer local_size,local_offset,baseoffset
    integer topidx,top,iproc
    double precision alpha,beta,xnorm,dot,xf

    ! external functions
    double precision ddot,dlapy2,dnrm2
    external ddot,dscal,dlapy2,dnrm2

    ! intrinsic
    intrinsic sign


    if (idx .le. 1) then
        tau = 0.0d0
        return
    end if

    call MPI_Comm_rank(mpi_comm, mpirank, mpierr)
    call MPI_Comm_size(mpi_comm, mpiprocs, mpierr)

    ! calculate expected work size and store in work(1)
    if (hgmode .eq. ichar('s')) then
        ! allreduce (MPI_SUM)
        sendsize = 2
        recvsize = sendsize
    else if (hgmode .eq. ichar('x')) then
        ! alltoall
        sendsize = mpiprocs*2
        recvsize = sendsize
    else if (hgmode .eq. ichar('g')) then
        ! allgather
        sendsize = 2
        recvsize = mpiprocs*sendsize
    else
        ! no exchange at all (benchmarking)
        sendsize = 2
        recvsize = sendsize
    end if
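
    ! (summary of exchange buffer sizes per process:
    !    's' -> allreduce:  send 2,           recv 2
    !    'x' -> alltoall:   send 2*mpiprocs,  recv 2*mpiprocs
    !    'g' -> allgather:  send 2,           recv 2*mpiprocs
    !  anything else skips the exchange and is only useful for benchmarking)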

    if (lwork .eq. -1) then
        work(1) = DBLE(sendsize + recvsize)
        return
    end if
 
    ! Processor id for global index of top element
    mpirank_top = MOD((idx-1)/nb,mpiprocs)
    if (mpirank .eq. mpirank_top) then
        topidx = local_index(idx,mpirank_top,mpiprocs,nb,0)
        top = 1+(topidx-1)*incx
    end if
 
    call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
                              local_size,baseoffset,local_offset)

    local_offset = local_offset * incx

    ! calculate and exchange information
    if (hgmode .eq. ichar('s')) then
        if (mpirank .eq. mpirank_top) then
            alpha = x(top)
        else 
            alpha = 0.0d0
        end if

        dot = ddot(local_size, &
                   x(local_offset), incx, &
                   x(local_offset), incx)

        work(1) = alpha
        work(2) = dot
        
        call mpi_allreduce(work(1),work(sendsize+1), &
                           sendsize,mpi_real8,mpi_sum, &
                           mpi_comm,mpierr)

        alpha = work(sendsize+1)
        xnorm = sqrt(work(sendsize+2))
    else if (hgmode .eq. ichar('x')) then
        if (mpirank .eq. mpirank_top) then
            alpha = x(top)
        else 
            alpha = 0.0d0
        end if

        xnorm = dnrm2(local_size, x(local_offset), incx)

        do iproc=0,mpiprocs-1
            work(2*iproc+1) = alpha
            work(2*iproc+2) = xnorm
        end do

        call mpi_alltoall(work(1),2,mpi_real8, &
                          work(sendsize+1),2,mpi_real8, &
                          mpi_comm,mpierr)

        ! extract alpha value
        alpha = work(sendsize+1+mpirank_top*2)

        ! copy norm parts of buffer to beginning
        do iproc=0,mpiprocs-1
            work(iproc+1) = work(sendsize+1+2*iproc+1)
        end do

        xnorm = dnrm2(mpiprocs, work(1), 1)
    else if (hgmode .eq. ichar('g')) then
        if (mpirank .eq. mpirank_top) then
            alpha = x(top)
        else 
            alpha = 0.0d0
        end if

        xnorm = dnrm2(local_size, x(local_offset), incx)
        work(1) = alpha
        work(2) = xnorm

        ! allgather
        call mpi_allgather(work(1),sendsize,mpi_real8, &
                          work(sendsize+1),sendsize,mpi_real8, &
                          mpi_comm,mpierr)

        ! extract alpha value
        alpha = work(sendsize+1+mpirank_top*2)
 
        ! copy norm parts of buffer to beginning
        do iproc=0,mpiprocs-1
            work(iproc+1) = work(sendsize+1+2*iproc+1)
        end do

        xnorm = dnrm2(mpiprocs, work(1), 1)
    else
        ! dnrm2
        xnorm = dnrm2(local_size, x(local_offset), incx)

        if (mpirank .eq. mpirank_top) then
            alpha = x(top)
        else 
            alpha = 0.0d0
        end if

        ! no exchange at all (benchmarking)
 
        xnorm = 0.0d0
    end if

    !print *,'ref hg:', idx,xnorm,alpha
    !print *,x(1:n)

    ! calculate householder information
    if (xnorm .eq. 0.0d0) then
        ! H = I

        tau = 0.0d0
    else
        ! General case

        call hh_transform_real(alpha,xnorm**2,xf,tau)
        if (mpirank .eq. mpirank_top) then
            x(top) = alpha
        end if

        call dscal(local_size, xf, &
                   x(local_offset), incx)
 
        ! TODO: reimplement norm rescale method of 
        ! original PDLARFG using mpi?

    end if
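
    ! The call to hh_transform_real above determines the scalars of a
    ! Householder reflector H = I - tau * v * v**T (with the top element of v
    ! normalized to 1) that maps the distributed vector onto a multiple of the
    ! first unit vector; the returned alpha is the new top element and xf is
    ! the scaling applied to the remaining local entries. (Descriptive note
    ! only; see hh_transform_real in ELPA1 for the exact formulas.)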

    ! useful for debugging
    !print *,'hg:mpirank,idx,beta,alpha:',mpirank,idx,beta,alpha,1.0d0/(beta+alpha),tau
    !print *,x(1:n)

end subroutine tum_pdlarfg_1dcomm

subroutine tum_pdlarfg2_1dcomm_ref(a,lda,tau,t,ldt,v,ldv,baseidx,work,lwork,m,k,idx,mb,PQRPARAM,rev,mpicomm,actualk)
    implicit none
 
    ! parameter setup
    INTEGER     gmode_,rank_,eps_,upmode1_
    PARAMETER   (gmode_ = 1,rank_ = 2,eps_=3, upmode1_=4)

    ! input variables (local)
    integer lda,lwork,ldv,ldt
    double precision a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*)

    ! input variables (global)
    integer m,k,idx,baseidx,mb,rev,mpicomm
    integer PQRPARAM(*)
 
    ! output variables (global)
    integer actualk
 
    ! derived input variables from TUM_PQRPARAM
    integer eps

    ! local scalars
    integer irank,ivector
    double precision dseedwork_size(1),dseed_size
    integer seedwork_size,seed_size
    integer seedwork_offset,seed_offset
    logical accurate

    call tum_pdlarfg2_1dcomm_seed(a,lda,dseedwork_size(1),-1,work,m,mb,idx,rev,mpicomm)
    seedwork_size = dseedwork_size(1)
    seed_size = seedwork_size
 
    if (lwork .eq. -1) then
        work(1) = seedwork_size + seed_size
        return
    end if

    seedwork_offset = 1
    seed_offset = seedwork_offset + seedwork_size

    eps = PQRPARAM(3)

    ! check for border cases (only a 2x2 matrix left)
    if (idx .le. 1) then
        tau(1:2) = 0.0d0
        t(1:2,1:2) = 0.0d0
        return
    end if

    call tum_pdlarfg2_1dcomm_seed(a,lda,work(seedwork_offset),lwork,work(seed_offset),m,mb,idx,rev,mpicomm)

        if (eps .gt. 0) then
            accurate = tum_pdlarfg2_1dcomm_check(work(seed_offset),eps)
        else
            accurate = .true.
        end if

        call tum_pdlarfg2_1dcomm_vector(a(1,2),1,tau(2),work(seed_offset), &
                                        m,mb,idx,0,1,mpicomm)

        call tum_pdlarfg_copy_1dcomm(a(1,2),1, &
                                     v(1,2),1, &
                                     m,baseidx,idx,mb,1,mpicomm)

        call tum_pdlarfg2_1dcomm_update(v(1,2),1,baseidx,a(1,1),lda,work(seed_offset),m,idx,mb,rev,mpicomm)

        ! check for 2x2 matrix case => only one householder vector will be
        ! generated
        if (idx .gt. 2) then
            if (accurate .eqv. .true.) then
                call tum_pdlarfg2_1dcomm_vector(a(1,1),1,tau(1),work(seed_offset), &
                                                m,mb,idx-1,1,1,mpicomm)

                call tum_pdlarfg_copy_1dcomm(a(1,1),1, &
                                             v(1,1),1, &
                                             m,baseidx,idx-1,mb,1,mpicomm)

                ! generate fuse element
                call tum_pdlarfg2_1dcomm_finalize_tmatrix(work(seed_offset),tau,t,ldt)

                actualk = 2
            else
                t(1,1) = 0.0d0
                t(1,2) = 0.0d0
                t(2,2) = tau(2)

                actualk = 1
            end if
        else
            t(1,1) = 0.0d0
            t(1,2) = 0.0d0
            t(2,2) = tau(2)

            ! no more vectors to create

            tau(1) = 0.0d0

            actualk = 2

            !print *,'rank2: no more data'
        end if

end subroutine tum_pdlarfg2_1dcomm_ref

subroutine tum_pdlarfg2_1dcomm_seed(a,lda,work,lwork,seed,n,nb,idx,rev,mpicomm)
    use ELPA1
    use tum_utils

    implicit none

    ! input variables (local)
    integer lda,lwork
    double precision a(lda,*),work(*),seed(*)

    ! input variables (global)
    integer n,nb,idx,rev,mpicomm
 
    ! output variables (global)

    ! external functions
    double precision ddot
    external ddot

    ! local scalars
    double precision top11,top21,top12,top22
    double precision dot11,dot12,dot22
    integer mpirank,mpiprocs,mpierr
    integer mpirank_top11,mpirank_top21
    integer top11_offset,top21_offset
    integer baseoffset
    integer local_offset1,local_size1
    integer local_offset2,local_size2

    if (lwork .eq. -1) then
        work(1) = DBLE(8)
        return
    end if
  
    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

        call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
                              local_size1,baseoffset,local_offset1)

        call local_size_offset_1d(n,nb,idx,idx-2,rev,mpirank,mpiprocs, &
                              local_size2,baseoffset,local_offset2)

        mpirank_top11 = MOD((idx-1)/nb,mpiprocs)
        mpirank_top21 = MOD((idx-2)/nb,mpiprocs)

        top11_offset = local_index(idx,mpirank_top11,mpiprocs,nb,0)
        top21_offset = local_index(idx-1,mpirank_top21,mpiprocs,nb,0)

        if (mpirank_top11 .eq. mpirank) then
            top11 = a(top11_offset,2)
            top12 = a(top11_offset,1)
        else
            top11 = 0.0d0
            top12 = 0.0d0
        end if

        if (mpirank_top21 .eq. mpirank) then
            top21 = a(top21_offset,2)
            top22 = a(top21_offset,1)
        else
            top21 = 0.0d0
            top22 = 0.0d0
        end if

        ! calculate 3 dot products
        dot11 = ddot(local_size1,a(local_offset1,2),1,a(local_offset1,2),1)
        dot12 = ddot(local_size1,a(local_offset1,2),1,a(local_offset1,1),1)
        dot22 = ddot(local_size2,a(local_offset2,1),1,a(local_offset2,1),1)

    ! store results in work buffer
    work(1) = top11
    work(2) = dot11
    work(3) = top12
    work(4) = dot12
    work(5) = top21
    work(6) = top22
    work(7) = dot22
    work(8) = 0.0d0 ! fill up buffer

    ! exchange partial results
    call mpi_allreduce(work, seed, 8, mpi_real8, mpi_sum, &
                       mpicomm, mpierr)
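
    ! after the allreduce, seed(1:8) holds the globally summed top elements
    ! and dot products (top11, dot11, top12, dot12, top21, top22, dot22,
    ! padding), i.e. the quantities consumed by the subsequent check, vector,
    ! update and T-matrix finalize steps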
end subroutine tum_pdlarfg2_1dcomm_seed

logical function tum_pdlarfg2_1dcomm_check(seed,eps)
    implicit none

    ! input variables
    double precision seed(*)
    integer eps

    ! local scalars
    double precision epsd,first,second,first_second,estimate
    logical accurate
    double precision dot11,dot12,dot22
    double precision top11,top12,top21,top22
 
    EPSD = EPS
  
    top11 = seed(1)
    dot11 = seed(2)
    top12 = seed(3)
    dot12 = seed(4)
        
    top21 = seed(5)
    top22 = seed(6)
    dot22 = seed(7)

    ! reconstruct the whole inner products 
    ! (including squares of the top elements)
    first = dot11 + top11*top11
    second = dot22 + top22*top22 + top12*top12
    first_second = dot12 + top11*top12

    estimate = abs((first_second*first_second)/(first*second)) 
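    ! estimate is the squared cosine of the angle between the two columns:
    ! (x1.x2)**2 / (||x1||**2 * ||x2||**2). A value close to 1 means the
    ! columns are nearly linearly dependent, in which case the caller falls
    ! back to generating only one Householder vector (actualk = 1).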

    !print *,'estimate:',estimate
    
    ! if accurate the following check holds
    accurate = (estimate .LE. (epsd/(1.0d0+epsd)))
  
    tum_pdlarfg2_1dcomm_check = accurate
end function tum_pdlarfg2_1dcomm_check

! id=0: first vector
! id=1: second vector
subroutine tum_pdlarfg2_1dcomm_vector(x,incx,tau,seed,n,nb,idx,id,rev,mpicomm)
    use ELPA1
    use tum_utils
 
    implicit none

    ! input variables (local)
    integer lda,lwork,incx
    double precision x(*),seed(*),tau

    ! input variables (global)
    integer n,nb,idx,id,rev,mpicomm
 
    ! output variables (global)

    ! external functions
    double precision dlapy2
    external dlapy2,dscal

    ! local scalars
    integer mpirank,mpirank_top,mpiprocs,mpierr
    double precision alpha,dot,beta,xnorm
    integer local_size,baseoffset,local_offset,top,topidx

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

    call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
                                  local_size,baseoffset,local_offset)

    local_offset = local_offset * incx

    ! Processor id for global index of top element
    mpirank_top = MOD((idx-1)/nb,mpiprocs)
    if (mpirank .eq. mpirank_top) then
        topidx = local_index(idx,mpirank_top,mpiprocs,nb,0)
        top = 1+(topidx-1)*incx
    end if

    alpha = seed(id*5+1)
    dot = seed(id*5+2)
    
    xnorm = sqrt(dot)

    if (xnorm .eq. 0.0d0) then
        ! H = I

        tau = 0.0d0
    else
        ! General case

        beta = sign(dlapy2(alpha, xnorm), alpha)
        tau = (beta+alpha) / beta
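        ! (dlapy2(alpha,xnorm) = sqrt(alpha**2 + xnorm**2) computed without
        !  overflow; beta carries the sign of alpha, so beta+alpha cannot
        !  suffer from cancellation)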

        !print *,'hg2',tau,xnorm,alpha
        
        call dscal(local_size, 1.0d0/(beta+alpha), &
                   x(local_offset), incx)
 
        ! TODO: reimplement norm rescale method of 
        ! original PDLARFG using mpi?

        if (mpirank .eq. mpirank_top) then
            x(top) = -beta