elpa2_template.X90 15 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
 !    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
 function elpa_solve_evp_&
  &MATH_DATATYPE&
  &_&
  &2stage_&
  &PRECISION&
57
  &_impl (obj, a, ev, q) result(success)
58

59
   use elpa_abstract_impl
60
   use elpa_utilities
61
62
63
64
65
66
67
   use elpa1_compute
   use elpa2_compute
   use elpa_mpi
   use cuda_functions
   use mod_check_for_gpu
   use iso_c_binding
   implicit none
68
   class(elpa_abstract_impl_t), intent(inout) :: obj
69
   logical                                   :: useGPU
70
#if REALCASE == 1
71
   logical                                   :: useQR
72
#endif
73
   logical                                   :: useQRActual
74
75
76

   integer(kind=c_int)                       :: bandwidth

77
   integer(kind=c_int)                       :: kernel
78
79

#ifdef USE_ASSUMED_SIZE
80
   MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,*), q(obj%local_nrows,*)
81
#else
82
83
   MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,obj%local_ncols), &
                                                         q(obj%local_nrows,obj%local_ncols)
84
#endif
85
   real(kind=C_DATATYPE_KIND), intent(inout) :: ev(obj%na)
86
87
88
89
90
91
92
93
94
95
96
97
98
99
   MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable   :: hh_trans(:,:)

   integer(kind=c_int)                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer(kind=c_int)                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: tmat(:,:,:)
   real(kind=C_DATATYPE_KIND), allocatable   :: e(:)
#if COMPLEXCASE == 1
   real(kind=C_DATATYPE_KIND), allocatable   :: q_real(:,:)
#endif
   integer(kind=c_intptr_t)                  :: tmat_dev, q_dev, a_dev

   integer(kind=c_int)                       :: i
   logical                                   :: success, successCUDA
   logical                                   :: wantDebug
100
   integer(kind=c_int)                       :: istat, gpu, debug, qr
101
102
103
104
105
106
107
   character(200)                            :: errorMessage
   logical                                   :: do_useGPU, do_useGPU_trans_ev_tridi
   integer(kind=c_int)                       :: numberOfGPUDevices
   integer(kind=c_intptr_t), parameter       :: size_of_datatype = size_of_&
                                                                   &PRECISION&
                                                                   &_&
                                                                   &MATH_DATATYPE
108
109
    integer(kind=ik)                         :: na, nev, lda, ldq, nblk, matrixCols, &
                                                mpi_comm_rows, mpi_comm_cols, mpi_comm_all
110

111
    call obj%timer%start("elpa_solve_evp_&
112
    &MATH_DATATYPE&
113
114
115
    &_2stage_&
    &PRECISION&
    &")
116

117
118
119
120
121
122
123
124
    na         = obj%na
    nev        = obj%nev
    lda        = obj%local_nrows
    ldq        = obj%local_nrows
    nblk       = obj%nblk
    matrixCols = obj%local_ncols

#if REALCASE == 1
125
    call obj%get("real_kernel",kernel)
126
    ! check consistency between request for GPUs and defined kernel
127
128
    call obj%get("gpu", gpu)
    if (gpu == 1) then
129
130
      if (kernel .ne. ELPA_2STAGE_REAL_GPU) then
        write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
131
      else if (nblk .ne. 128) then
132
133
134
135
136
137
        kernel = ELPA_2STAGE_REAL_GENERIC
      endif
    endif
#endif

#if COMPLEXCASE == 1
138
    call obj%get("complex_kernel",kernel)
139
    ! check consistency between request for GPUs and defined kernel
140
141
    call obj%get("gpu", gpu)
    if (gpu == 1) then
142
143
      if (kernel .ne. ELPA_2STAGE_COMPLEX_GPU) then
        write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
144
      else if (nblk .ne. 128) then
145
146
147
148
        kernel = ELPA_2STAGE_COMPLEX_GENERIC
      endif
    endif
#endif
149
150
151
    call obj%get("mpi_comm_rows",mpi_comm_rows)
    call obj%get("mpi_comm_cols",mpi_comm_cols)
    call obj%get("mpi_comm_parent",mpi_comm_all)
152

153
154
    call obj%get("gpu",gpu)
    if (gpu .eq. 1) then
155
156
157
158
159
160
      useGPU = .true.
    else
      useGPU = .false.
    endif

#if REALCASE == 1
161
162
    call obj%get("qr",qr)
    if (qr .eq. 1) then
163
164
165
166
167
168
      useQR = .true.
    else
      useQR = .false.
    endif

#endif
169
    call obj%timer%start("mpi_communication")
170
171
172
173
174
175
176
    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
177
    call obj%timer%stop("mpi_communication")
178

179
180
    call obj%get("debug",debug)
    wantDebug = debug == 1
181
182
183
184
185
186
187
188
189
    success = .true.

    do_useGPU      = .false.
    do_useGPU_trans_ev_tridi =.false.


#if REALCASE == 1
    useQRActual = .false.
    ! set usage of qr decomposition via API call
190
191
    if (useQR) useQRActual = .true.
    if (.not.(useQR)) useQRACtual = .false.
192
193
194
195
196
197
198
199
200
201
202
203
204

    if (useQRActual) then
      if (mod(na,2) .ne. 0) then
        if (wantDebug) then
          write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
        endif
        print *, "Do not use QR-decomposition for this matrix and blocksize."
        success = .false.
        return
      endif
    endif
#endif /* REALCASE */

205
206
    if (useGPU) then
      if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
207

208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
         do_useGPU = .true.

         ! set the neccessary parameters
         cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
         cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
         cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
         cudaHostRegisterPortable = cuda_hostRegisterPortable()
         cudaHostRegisterMapped   = cuda_hostRegisterMapped()
      else
        print *,"GPUs are requested but not detected! Aborting..."
        success = .false.
        return
      endif
    else
      ! check whether set by environment variable
223
224
      call obj%get("gpu",gpu)
      do_useGPU = gpu == 1
225
226
      if (do_useGPU) then
        if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
227
228
229
230
231
232
233
234
235
236
237
238
239

           ! set the neccessary parameters
           cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
           cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
           cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
           cudaHostRegisterPortable = cuda_hostRegisterPortable()
           cudaHostRegisterMapped   = cuda_hostRegisterMapped()
        else
          print *,"GPUs are requested but not detected! Aborting..."
          success = .false.
          return
        endif
      endif
240
    endif
241
242
243

    ! check consistency between request for GPUs and defined kernel
    if (do_useGPU) then
244
245
246
247
248
249
      if (nblk .ne. 128) then
        ! cannot run on GPU with this blocksize
        ! disable GPU usage for trans_ev_tridi
        do_useGPU_trans_ev_tridi = .false.
      else
        do_useGPU_trans_ev_tridi = .true.
250
251
      endif
    endif
252
    call obj%timer%start("bandred")
253

Andreas Marek's avatar
Andreas Marek committed
254
    if (obj%is_set("bandwidth") == 1) then
255
      call obj%get("bandwidth",nbw)
256
257
      if ((nbw == 0) .or. (mod(nbw, nblk) .ne. 0)) then
        if (wantDebug) then
Andreas Marek's avatar
Andreas Marek committed
258
          write(error_unit,*) "Specified bandwidth has to be a multiple of blocksize: ",nbw
259
260
261
262
263
        endif
        print *, "Specified bandwidth has to be a multiple of blocksize"
        success = .false.
        return
      endif
Andreas Marek's avatar
Andreas Marek committed
264
265

      !ttts = MPI_Wtime()
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
    else

      ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
      ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
      ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
      ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
      ! on this and maybe allow a run-time optimization here
      if (do_useGPU) then
        nbw = nblk
      else
#if REALCASE == 1
        nbw = (63/nblk+1)*nblk
#elif COMPLEXCASE == 1
        nbw = (31/nblk+1)*nblk
#endif
      endif

      num_blocks = (na-1)/nbw + 1

      allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"solve_evp_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
288
289
290
291
  &MATH_DATATYPE&
  &_2stage_&
  &PRECISION&
  &" // ": error when allocating tmat "//errorMessage
292
293
294
295
296
297
298
299
        stop 1
      endif

      ! Reduction full -> band
      call bandred_&
      &MATH_DATATYPE&
      &_&
      &PRECISION &
300
      (obj, na, a, &
301
302
303
304
305
306
       a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, tmat, &
       tmat_dev,  wantDebug, do_useGPU, success &
#if REALCASE == 1
       , useQRActual &
#endif
       )
307

308
309
      if (.not.(success)) return
    end if  ! matrix not already banded on input
310
    call obj%timer%stop("bandred")
311
312
313
314
315
316
317
318
319
320
321
322

     ! Reduction band -> tridiagonal

     allocate(e(na), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_&
       &MATH_DATATYPE&
       &_2stage_&
       &PRECISION " // ": error when allocating e "//errorMessage
       stop 1
     endif

323
     call obj%timer%start("tridiag")
324
325
326
327
     call tridiag_band_&
     &MATH_DATATYPE&
     &_&
     &PRECISION&
328
     (obj, na, nbw, nblk, a, a_dev, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, do_useGPU)
329
330

#ifdef WITH_MPI
331
     call obj%timer%start("mpi_communication")
332
333
     call mpi_bcast(ev, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
     call mpi_bcast(e, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
334
     call obj%timer%stop("mpi_communication")
335
#endif /* WITH_MPI */
336
     call obj%timer%stop("tridiag")
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352

#if COMPLEXCASE == 1
     l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
     l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
     l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

     allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_&
       &MATH_DATATYPE&
       &_2stage: error when allocating q_real"//errorMessage
       stop 1
     endif
#endif

     ! Solve tridiagonal system
353
     call obj%timer%start("solve")
354
355
     call solve_tridi_&
     &PRECISION &
356
     (obj, na, nev, ev, e, &
357
358
359
360
361
362
363
#if REALCASE == 1
     q, ldq,   &
#endif
#if COMPLEXCASE == 1
     q_real, ubound(q_real,dim=1), &
#endif
     nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success)
364
     call obj%timer%stop("solve")
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
     if (.not.(success)) return

     deallocate(e, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_&
       &MATH_DATATYPE&
       &_2stage: error when deallocating e "//errorMessage
       stop 1
     endif

#if COMPLEXCASE == 1
     q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

     deallocate(q_real, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_&
       &MATH_DATATYPE&
       &_2stage: error when deallocating q_real"//errorMessage
       stop 1
     endif
#endif
     ! Backtransform stage 1
387
388
     call obj%timer%start("trans_ev_to_band")

389
390
391
392
     call trans_ev_tridi_to_band_&
     &MATH_DATATYPE&
     &_&
     &PRECISION &
393
     (obj, na, nev, nblk, nbw, q, &
394
395
     q_dev, &
     ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi, &
396
     success=success, kernel=kernel)
397
     call obj%timer%stop("trans_ev_to_band")
398
399
400
401
402
403
404
405
406
407
408
409
410

     if (.not.(success)) return

     ! We can now deallocate the stored householder vectors
     deallocate(hh_trans, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *, "solve_evp_&
       &MATH_DATATYPE&
       &_2stage_&
       &PRECISION " // ": error when deallocating hh_trans "//errorMessage
       stop 1
     endif

411
     call obj%timer%start("trans_ev_to_full")
Andreas Marek's avatar
Andreas Marek committed
412
     if(obj%is_set("bandwidth") .ne. 1) then
413
414
415
416
417
418
419
420
       if ( (do_useGPU) .and. .not.(do_useGPU_trans_ev_tridi) ) then
         ! copy to device if we want to continue on GPU
         successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)

         successCUDA = cuda_memcpy(q_dev, loc(q), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
       endif

       ! Backtransform stage 2
421

422
423
424
425
       call trans_ev_band_to_full_&
       &MATH_DATATYPE&
       &_&
       &PRECISION &
426
       (obj, na, nev, nblk, nbw, a, &
427
428
429
430
431
432
433
        a_dev, lda, tmat, tmat_dev,  q,  &
        q_dev, &
        ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU &
#if REALCASE == 1
        , useQRActual  &
#endif
        )
434

435
436
437
438

       deallocate(tmat, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"solve_evp_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
439
440
441
   &MATH_DATATYPE&
   &_2stage_&
   &PRECISION " // ": error when deallocating tmat"//errorMessage
442
443
444
         stop 1
       endif
     endif
445
     call obj%timer%stop("trans_ev_to_full")
446

447
     call obj%timer%stop("elpa_solve_evp_&
448
     &MATH_DATATYPE&
449
450
451
     &_2stage_&
    &PRECISION&
    &")
452
453
454
455
456
457
1    format(a,f10.3)

   end function elpa_solve_evp_&
   &MATH_DATATYPE&
   &_2stage_&
   &PRECISION&
458
   &_impl
459
460

! vim: syntax=fortran