elpa2_template.X90 14 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
  function solve_evp_&
  &MATH_DATATYPE&
  &_&
  &2stage_&
  &PRECISION &
  (na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all,   &
#if REALCASE == 1
   THIS_ELPA_KERNEL_API, useQR,   &
#endif
#if COMPLEXCASE == 1
   THIS_ELPA_KERNEL_API,          &
12
#endif
13
14
   useGPU, bandwidth) result(success)

15
#ifdef HAVE_DETAILED_TIMINGS
Andreas Marek's avatar
Cleanup    
Andreas Marek committed
16
   use timings
17
#else
Andreas Marek's avatar
Cleanup    
Andreas Marek committed
18
   use timings_dummy
19
20
21
22
23
24
25
26
27
28
29
30
#endif
   use elpa1_utilities, only : gpu_usage_via_environment_variable
   use elpa1_compute
   use elpa2_compute
   use elpa_mpi
   use cuda_functions
   use mod_check_for_gpu
   use iso_c_binding
   implicit none
   logical, intent(in), optional             :: useGPU
#if REALCASE == 1
   logical, intent(in), optional             :: useQR
31
#endif
32
   logical                                   :: useQRActual, useQREnvironment
33
34
35

   integer(kind=c_int), intent(in), optional :: bandwidth

36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
   integer(kind=c_int), intent(in), optional :: THIS_ELPA_KERNEL_API
   integer(kind=c_int)                       :: THIS_ELPA_KERNEL

   integer(kind=c_int), intent(in)           :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
                                                mpi_comm_cols, mpi_comm_all
   integer(kind=c_int), intent(in)           :: nblk

#ifdef USE_ASSUMED_SIZE
   MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(lda,*), q(ldq,*)
#else
   MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(lda,matrixCols), q(ldq,matrixCols)
#endif
   real(kind=C_DATATYPE_KIND), intent(inout) :: ev(na)
   MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable   :: hh_trans(:,:)

   integer(kind=c_int)                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer(kind=c_int)                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: tmat(:,:,:)
   real(kind=C_DATATYPE_KIND), allocatable   :: e(:)
#if COMPLEXCASE == 1
   real(kind=C_DATATYPE_KIND), allocatable   :: q_real(:,:)
#endif
   integer(kind=c_intptr_t)                  :: tmat_dev, q_dev, a_dev
   real(kind=c_double)                        :: ttt0, ttt1, ttts  ! MPI_WTIME always needs double

   integer(kind=c_int)                       :: i
   logical                                   :: success
   logical, save                             :: firstCall = .true.
   logical                                   :: wantDebug
   integer(kind=c_int)                       :: istat
   character(200)                            :: errorMessage
   logical                                   :: do_useGPU
   integer(kind=c_int)                       :: numberOfGPUDevices

70
71
    call timer%start("solve_evp_&
    &MATH_DATATYPE&
Andreas Marek's avatar
Cleanup    
Andreas Marek committed
72
    &_2stage" // &
73
74
    &PRECISION_SUFFIX &
    )
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

    call timer%start("mpi_communication")
    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
    call timer%stop("mpi_communication")

    wantDebug = .false.
    if (firstCall) then
      ! are debug messages desired?
      wantDebug = debug_messages_via_environment_variable()
      firstCall = .false.
    endif

    success = .true.

    do_useGPU      = .false.

#if REALCASE == 1
    useQRActual = .false.
    ! set usage of qr decomposition via API call
    if (present(useQR)) then
      if (useQR) useQRActual = .true.
        if (.not.(useQR)) useQRACtual = .false.
    endif

    ! overwrite this with environment variable settings
    if (qr_decomposition_via_environment_variable(useQREnvironment)) then
      useQRActual = useQREnvironment
    endif

    if (useQRActual) then
      if (mod(na,2) .ne. 0) then
        if (wantDebug) then
          write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
        endif
        print *, "Do not use QR-decomposition for this matrix and blocksize."
        success = .false.
        return
      endif
    endif
120
#endif /* REALCASE */
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162

    if (present(useGPU)) then
      if (useGPU) then
        if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then

           do_useGPU = .true.
           ! set the neccessary parameters
           cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
           cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
           cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
           cudaHostRegisterPortable = cuda_hostRegisterPortable()
           cudaHostRegisterMapped   = cuda_hostRegisterMapped()
        else
          print *,"GPUs are requested but not detected! Aborting..."
          success = .false.
          return
        endif
      endif
    else
      ! check whether set by environment variable
      do_useGPU = gpu_usage_via_environment_variable()
   endif

    if (present(THIS_ELPA_KERNEL_API)) then
      ! user defined kernel via the optional argument in the API call
      THIS_ELPA_KERNEL = THIS_ELPA_KERNEL_API
    else

      ! if kernel is not choosen via api
      ! check whether set by environment variable
      THIS_ELPA_KERNEL = get_actual_&
            &MATH_DATATYPE&
            &_kernel()
    endif

    ! check whether choosen kernel is allowed: function returns true if NOT allowed! change this
    if (check_allowed_&
       &MATH_DATATYPE&
       &_kernels(THIS_ELPA_KERNEL)) then

      if (my_pe == 0) then
        write(error_unit,*) " "
163
164
165
        write(error_unit,*) "The choosen kernel ", &
	&MATH_DATATYPE&
	&_ELPA_KERNEL_NAMES(THIS_ELPA_KERNEL)
166
167
168
        write(error_unit,*) "is not in the list of the allowed kernels!"
        write(error_unit,*) " "
        write(error_unit,*) "Allowed kernels are:"
169
170
171
172
173
174
175
176
177
178
179
180
181
        do i=1,size( &
	&MATH_DATATYPE&
	&_ELPA_KERNEL_NAMES(:))
          if (AVAILABLE_&
	  &MATH_DATATYPE&
	  &_ELPA_KERNELS(i) .ne. 0) then
            write(error_unit,*) &
#if REALCASE == 1
	                        REAL_ELPA_KERNEL_NAMES(i)
#endif
#if COMPLEXCASE == 1
	                        COMPLEX_ELPA_KERNEL_NAMES(i)
#endif
182
183
184
185
186
          endif
        enddo

        write(error_unit,*) " "
        ! check whether generic kernel is defined
187
188
189
190
191
192
193
194
195
196
197
198
199
200
         if (AVAILABLE_&
	 &MATH_DATATYPE&
	 &_ELPA_KERNELS( &
#if REALCASE == 1
	                REAL_ELPA_KERNEL_GENERIC ) .eq. 1) then
#endif
#if COMPLEXCASE == 1
	                COMPLEX_ELPA_KERNEL_GENERIC ) .eq. 1) then

#endif
           write(error_unit,*) "The default kernel &
	   &MATH_DATATYPE&
	   &( &
	   &_ELPA_KERNEL_GENERIC will be used !"
201
         else
202
203
204
205
206
           write(error_unit,*) "As default kernel ", &
	   &MATH_DATATYPE&
	   &_ELPA_KERNEL_NAMES(DEFAULT_&
	   &MATH_DATATYPE&
	   &_ELPA_KERNEL)," will be used"
207
208
         endif
      endif  ! my_pe == 0
209
210
211
212
213
214
215
216
      if (AVAILABLE_&
      &MATH_DATATYPE&
      &_ELPA_KERNELS( &
      &MATH_DATATYPE&
      &_ELPA_KERNEL_GENERIC) .eq. 1) then
        THIS_ELPA_KERNEL = &
	&MATH_DATATYPE&
	&_ELPA_KERNEL_GENERIC
217
      else
218
219
220
        THIS_ELPA_KERNEL = DEFAULT_&
	&MATH_DATATYPE&
	&_ELPA_KERNEL
221
222
223
224
225
      endif
    endif

    ! check consistency between request for GPUs and defined kernel
    if (do_useGPU) then
226
227
228
      if (THIS_ELPA_KERNEL .ne. &
      &MATH_DATATYPE&
      &_ELPA_KERNEL_GPU) then
229
230
231
232
233
234
235
236
237
238
239
240
241
242
        write(error_unit,*) "GPU usage has been requested but compute kernel is defined as non-GPU! Aborting..."
        success = .false.
        return
      endif
    endif

    if (do_useGPU) then
      if (nblk .ne. 128) then
        write(error_unit,*) "In case of GPU usage the blocksize for ELPA 2stage has to be 128"
        success = .false.
        return
      endif
    endif

243
244
245
246
247
248
249
250
251
252
253
254
255
    if(present(bandwidth)) then
      nbw = bandwidth

      if ((nbw == 0) .or. (mod(nbw, nblk) .ne. 0)) then
        if (wantDebug) then
          write(error_unit,*) "Specified bandwidth has to be a multiple of blocksize"
        endif
        print *, "Specified bandwidth has to be a multiple of blocksize"
        success = .false.
        return
      endif

      ttts = MPI_Wtime()
256
    else
257
258
259
260
261
262
263
264
265

      ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
      ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
      ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
      ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
      ! on this and maybe allow a run-time optimization here
      if (do_useGPU) then
        nbw = nblk
      else
266
#if REALCASE == 1
267
        nbw = (63/nblk+1)*nblk
268
#elif COMPLEXCASE == 1
269
        nbw = (31/nblk+1)*nblk
270
#endif
271
      endif
272

273
      num_blocks = (na-1)/nbw + 1
274

275
276
      allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
277
278
        print *,"solve_evp_&
	&MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
279
280
281
	&_2stage_&
	&PRECISION&
	&" // ": error when allocating tmat "//errorMessage
282
283
        stop
      endif
284

285
      ! Reduction full -> band
286

287
288
      ttt0 = MPI_Wtime()
      ttts = ttt0
289
290
291
292
293
      call bandred_&
      &MATH_DATATYPE&
      &_&
      &PRECISION &
      (na, a, &
Andreas Marek's avatar
Andreas Marek committed
294
295
       a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, tmat, &
       tmat_dev,  wantDebug, do_useGPU, success &
296
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
297
       , useQRActual &
298
#endif
Andreas Marek's avatar
Andreas Marek committed
299
       )
300
301
302
      if (.not.(success)) return
      ttt1 = MPI_Wtime()
      if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
303
304
305
        write(error_unit,*) "Time " // "bandred_&
	&MATH_DATATYPE&
	&PRECISION " // "               :",ttt1-ttt0
306
307

    end if  ! matrix not already banded on input
308
309
310
311
312

     ! Reduction band -> tridiagonal

     allocate(e(na), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
313
314
315
316
       print *,"solve_evp_&
       &MATH_DATATYPE&
       &_2stage_&
       &PRECISION " // ": error when allocating e "//errorMessage
317
318
319
320
321
       stop
     endif

     ttt0 = MPI_Wtime()

322
323
324
325
326
     call tridiag_band_&
     &MATH_DATATYPE&
     &_&
     &PRECISION&
     (na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
327
328
329

     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
330
331
332
333
       write(error_unit,*) "Time " // "tridiag_band_&
       &MATH_DATATYPE&
       &_&
       &PRECISION " // "          :",ttt1-ttt0
334
335
336
337
338
339
340
341
342
343

#ifdef WITH_MPI
     call timer%start("mpi_communication")
     call mpi_bcast(ev, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
     call mpi_bcast(e, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
     call timer%stop("mpi_communication")
#endif /* WITH_MPI */
     ttt1 = MPI_Wtime()
     time_evp_fwd = ttt1-ttts

344
#if COMPLEXCASE == 1
345
346
347
348
349
350
     l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
     l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
     l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

     allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
351
352
353
       print *,"solve_evp_&
       &MATH_DATATYPE&
       &_2stage: error when allocating q_real"//errorMessage
354
355
       stop
     endif
356
357
#endif

358
359
360
     ! Solve tridiagonal system

     ttt0 = MPI_Wtime()
361
362
363
     call solve_tridi_&
     &PRECISION &
     (na, nev, ev, e, &
364
#if REALCASE == 1
365
     q, ldq,   &
366
#endif
367
368
369
370
371
#if COMPLEXCASE == 1
     q_real, ubound(q_real,dim=1), &
#endif
     nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success)

372
373
374
375
376
377
378
379
380
381
     if (.not.(success)) return

     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
     write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
     time_evp_solve = ttt1-ttt0
     ttts = ttt1

     deallocate(e, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
382
383
384
       print *,"solve_evp_&
       &MATH_DATATYPE&
       &_2stage: error when deallocating e "//errorMessage
385
386
       stop
     endif
387
388

#if COMPLEXCASE == 1
389
390
     q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

391
     deallocate(q_real, stat=istat, errmsg=errorMessage)
392
     if (istat .ne. 0) then
393
394
395
       print *,"solve_evp_&
       &MATH_DATATYPE&
       &_2stage: error when deallocating q_real"//errorMessage
396
397
398
399
400
401
       stop
     endif
#endif
     ! Backtransform stage 1

     ttt0 = MPI_Wtime()
402
403
404
405
406
407

     call trans_ev_tridi_to_band_&
     &MATH_DATATYPE&
     &_&
     &PRECISION &
     (na, nev, nblk, nbw, q, &
408
#if REALCASE == 1
409
     q_dev, &
410
#endif
411
412
     ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU, &
     success, THIS_ELPA_KERNEL)
413
414
415
416

     if (.not.(success)) return
     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
417
418
419
420
       write(error_unit,*) "Time " // "trans_ev_tridi_to_band_&
       &MATH_DATATYPE&
       &_&
       &PRECISION " // " :",ttt1-ttt0
421
422
423
424

     ! We can now deallocate the stored householder vectors
     deallocate(hh_trans, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
425
426
427
428
       print *, "solve_evp_&
       &MATH_DATATYPE&
       &_2stage_&
       &PRECISION " // ": error when deallocating hh_trans "//errorMessage
429
430
431
       stop
     endif

432
     if(present(bandwidth)) then
433
434
435
436
       time_evp_back = ttt1-ttts
     else
       ! Backtransform stage 2
       ttt0 = MPI_Wtime()
437
438
439
440
441
       call trans_ev_band_to_full_&
       &MATH_DATATYPE&
       &_&
       &PRECISION &
       (na, nev, nblk, nbw, a, &
Andreas Marek's avatar
Andreas Marek committed
442
        a_dev, lda, tmat, tmat_dev,  q,  &
443
444
#if REALCASE == 1
        q_dev, &
445
#endif
446
447
448
449
450
        ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU &
#if REALCASE == 1
        , useQRActual  &
#endif
        )
451

452
453
       ttt1 = MPI_Wtime()
       if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
454
455
456
457
         write(error_unit,*) "Time " // "trans_ev_band_to_full_&
	 &MATH_DATATYPE&
	 &_&
	 &PRECISION " // " :",ttt1-ttt0
458

459
460
461
462
       time_evp_back = ttt1-ttts

       deallocate(tmat, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
463
464
465
466
         print *,"solve_evp_&
	 &MATH_DATATYPE&
	 _2stage_&
	 &PRECISION " // ": error when deallocating tmat"//errorMessage
467
468
469
         stop
       endif
     endif ! not present(bandwidth)
470

471
472
     call timer%stop("solve_evp_&
     &MATH_DATATYPE&
Andreas Marek's avatar
Cleanup    
Andreas Marek committed
473
     &_2stage" // &
474
475
     &PRECISION_SUFFIX &
     )
476
1    format(a,f10.3)
477
478
479
480
481

   end function solve_evp_&
   &MATH_DATATYPE&
   &_2stage_&
   &PRECISION