#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

#define COMPLEXCASE 1
#undef REALCASE
#include "elpa2_bandred_template.X90"
#undef COMPLEXCASE
#include "elpa2_herm_matrix_allreduce_complex_template.X90"
#include "elpa2_trans_ev_band_to_full_complex_template.X90"
#include "elpa2_tridiag_band_complex_template.X90"
#include "elpa2_trans_ev_tridi_to_band_complex_template.X90"

    subroutine compute_hh_dot_products_complex_gpu_PRECISION(nbw, n)
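      ! Launch the CUDA kernel that forms the dot products between the
      ! Householder vectors held in bcast_buffer_dev and stores them in
      ! hh_dot_dev; with a single vector (n <= 1) there is nothing to do.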
      use cuda_c_kernel
      use precision
      implicit none
      integer(kind=ik), value :: nbw, n

      if (n .le. 1) return
      call launch_compute_hh_dotp_c_kernel_complex_PRECISION( bcast_buffer_dev, hh_dot_dev, nbw,n)
     end subroutine

     subroutine pack_row_group_complex_gpu_PRECISION(rows, n_offset, row_count)
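       ! Pack a group of rows of the striped device matrix aIntern_dev into
       ! the contiguous buffer row_group_dev on the GPU, then copy the packed
       ! rows back into the host array "rows".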
       use cuda_c_kernel
       use precision
       implicit none
       integer(kind=ik), intent(in) :: n_offset, row_count
       complex(kind=COMPLEX_DATATYPE)             :: rows(:,:)
       integer(kind=ik)             :: max_idx
       logical                      :: successCUDA

       max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
       call launch_my_pack_c_kernel_complex_PRECISION(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
                                            l_nev, aIntern_dev, row_group_dev)
       successCUDA =  cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_PRECISION_complex, &
                                  cudaMemcpyDeviceToHost)
       if (.not.(successCUDA)) then
         print *,"pack_row_group_complex_gpu: error in cudaMemcpy"
         stop
       endif

     end subroutine

     subroutine unpack_row_group_complex_gpu_PRECISION(rows, n_offset, row_count)
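       ! Reverse of the packing step: copy the host array "rows" to
       ! row_group_dev and launch the kernel that scatters the rows back into
       ! the striped device matrix aIntern_dev.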
       use cuda_c_kernel
       use precision
       implicit none
       integer(kind=ik), intent(in)    :: n_offset, row_count
       complex(kind=COMPLEX_DATATYPE), intent(in)    :: rows(:, :)
       integer(kind=ik)                :: max_idx
       integer(kind=ik)                :: i
       logical                         :: successCUDA

       max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
       successCUDA =  cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_PRECISION_complex , &
                                  cudaMemcpyHostToDevice)
       if (.not.(successCUDA)) then
         print *,"unpack_row_group_complex_gpu: error in cudaMemcpy"
         stop
       endif
       call launch_my_unpack_c_kernel_complex_PRECISION( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
                                              row_group_dev,aIntern_dev)
     end subroutine

     subroutine unpack_and_prepare_row_group_complex_gpu_PRECISION(next_unpack_idx, force)
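       ! Flush the current row group to the device when it is full, when the
       ! next index is not contiguous, or when a flush is forced, and prepare
       ! the group for the row with index next_unpack_idx.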

       use precision
       implicit none
       integer(kind=ik), intent(in) :: next_unpack_idx
       logical, intent(in)          :: force

       if (row_group_size == 0) then
         ! Nothing to flush, just prepare for the upcoming row
         row_group_size = 1
       else
         if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /=next_unpack_idx)) then
           ! A flush and a reset must be performed
           call unpack_row_group_complex_gpu_PRECISION(row_group(:, :), unpack_idx - row_group_size, row_group_size)
           row_group_size = 1
         else
           ! Just prepare for the upcoming row
           row_group_size = row_group_size + 1
         endif
       endif
       ! Always update the index for the upcoming row
       unpack_idx = next_unpack_idx

    end subroutine

    subroutine compute_hh_trafo_complex_gpu_PRECISION(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
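      ! Apply the Householder transformations of the current block to one
      ! stripe of the device matrix via the CUDA kernel and accumulate the
      ! kernel flop count and timing statistics.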

      use iso_c_binding
      use cuda_c_kernel
      use precision
      implicit none
      integer(kind=ik), intent(in) :: off, ncols, istripe
      integer(kind=ik)             :: nl
      real(kind=c_double)          :: ttt ! MPI_WTIME always needs double

      integer(kind=ik)             :: a_off
      integer(kind=c_size_t)       :: dev_offset, dev_offset_1, dev_offset_2

      if (ncols < 1) return
      ttt = mpi_wtime()
      nl = merge(stripe_width, last_stripe_width, istripe < stripe_count)
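      ! Byte offsets into the device buffers: dev_offset selects the current
      ! stripe of aIntern_dev, dev_offset_1 the Householder vectors in
      ! bcast_buffer_dev, and dev_offset_2 the tau factors in hh_tau_dev.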

      dev_offset = (0 + ( (  a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
                    size_of_PRECISION_complex
      dev_offset_1 = (0 +  (  off-1 )* nbw) *size_of_PRECISION_complex
      dev_offset_2 =( off-1 )*size_of_PRECISION_complex

!      t1_compute_kernel =MPI_Wtime()
      call launch_compute_hh_trafo_c_kernel_complex_PRECISION(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
                                                    hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)

!      time0 = time0 + time1
!      t2_compute_kernel =MPI_Wtime()
!      t0_compute_kernel =  t0_compute_kernel + t2_compute_kernel-t1_compute_kernel

      kernel_flops = kernel_flops + 4 * int(nl, 8) * int(ncols, 8) * int(nbw,8)
      kernel_time = kernel_time + mpi_wtime() - ttt
      n_times =n_times +1
    end subroutine
end subroutine