ev_tridi_band_gpu_c_v2_real_template.Xcu 15.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen ,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13
//    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
//      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file was originally written by NVIDIA
// and re-written by A. Marek, MPCDF


#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include "config-f90.h"

57
58
59
#define BLOCK_CYCLIC_BLOCKSIZE 128
#define GLOBAL_STRIPE_WIDTH 256

Andreas Marek's avatar
Andreas Marek committed
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
// Perform the equivalent of "__shfl_xor" on an 8-byte value
#ifdef DOUBLE_PRECISION_REAL
static __device__ __forceinline__ double shfl_xor_real_double(double r, int mask)
#else
static __device__ __forceinline__ float shfl_xor_real_single(float r, int mask)
#endif
{
    int hi = __shfl_xor(__double2hiint(r), mask);
    int lo = __shfl_xor(__double2loint(r), mask);

    return __hiloint2double(hi, lo);
}

// Perform the equivalent of "__shfl_down" on an 8-byte value
#ifdef DOUBLE_PRECISION_REAL
static __device__ __forceinline__ double shfl_down_real_double(double r, int offset)
#else
static __device__ __forceinline__ float shfl_down_real_single(float r, int offset)
#endif
{
    int hi = __shfl_down(__double2hiint(r), offset);
    int lo = __shfl_down(__double2loint(r), offset);

    return __hiloint2double(hi, lo);
}

// Perform a reduction on a warp or the first part of it
template <unsigned int REDUCE_START_OFFSET>
#ifdef DOUBLE_PRECISION_REAL
__device__ __forceinline__ double warp_reduce_real_double(double r)
#else
__device__ __forceinline__ float warp_reduce_real_single(float r)
#endif
{
#pragma unroll
    for (int i = REDUCE_START_OFFSET; i >= 1; i >>= 1)
    {
#ifdef DOUBLE_PRECISION_REAL
        r += shfl_down_real_double(r, i);
#else
        r += shfl_down_real_single(r, i);
#endif
    }

    return r;
}

// Perform 2 reductions, using either 1 or 2 warps
template <unsigned int REDUCE_START_OFFSET, bool HAVE_2_WARPS>
#ifdef DOUBLE_PRECISION_REAL
__device__ __forceinline__ void double_warp_reduce_real_double(double * dotp_s, int w_off)
#else
__device__ __forceinline__ void float_warp_reduce_real_single(float * dotp_s, int w_off)
#endif
{
    int t_idx = threadIdx.x;

    if (HAVE_2_WARPS)
    {
        // In this case, we have 2 warps, each doing 1 reduction
        // attention
        if (t_idx < 64)
        {
#ifdef DOUBLE_PRECISION_REAL
            dotp_s[w_off + t_idx] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[w_off + t_idx] + dotp_s[w_off + t_idx + 32]);
#else
            dotp_s[w_off + t_idx] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[w_off + t_idx] + dotp_s[w_off + t_idx + 32]);
#endif
        }
    }
    else
    {
        // In this case we have 1 warp that performs both reductions
        // attention
        if (t_idx < 32)
        {
#ifdef DOUBLE_PRECISION_REAL
            dotp_s[t_idx] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
            dotp_s[t_idx + 64] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]);
#else
            dotp_s[t_idx] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
            dotp_s[t_idx + 64] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]);
#endif
        }
    }
}

147
148
// Reset the entire contents of a shared reduction block; the thread block size must be a power-of-2
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
149
__device__ __forceinline__ void reset_dotp_buffers_real_double(double * const __restrict__ s_block)
150
#else
Andreas Marek's avatar
Andreas Marek committed
151
__device__ __forceinline__ void reset_dotp_buffers_real_single(float * const __restrict__ s_block)
152
153
#endif
{
154
    // attention
155
156
157
158
159
160
161
162
163
164
165
    if (blockDim.x >= 64)
    {
        int t_idx = threadIdx.x;

        if (t_idx < 64)
        {
            s_block[t_idx] = s_block[t_idx + 64] = 0.0;
        }
    }
    else
    {
166
        int s_chunk = BLOCK_CYCLIC_BLOCKSIZE / blockDim.x;
167
168
169
170
171
172
173
174
175
176
#ifdef DOUBLE_PRECISION_REAL
        int s_chunk_size = s_chunk * sizeof(double);
#else
        int s_chunk_size = s_chunk * sizeof(float);
#endif
        // Each thread resets an equally-sized, contiguous portion of the buffer
        memset(s_block + threadIdx.x * s_chunk, 0, s_chunk_size);
    }
}

177
178
179
180
181
182
183
// =========================
// Backtransformation kernel
// =========================

// We use templates here to avoid additional branching based on the actual size of the thread-block
template<unsigned int REDUCE_START_OFFSET, bool HAVE_2_WARPS>
#ifdef DOUBLE_PRECISION_REAL
184
__global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_kernel_real_double(double * const __restrict__ q, const double * const __restrict__ hh, const double * const __restrict__ hh_dot,
185
186
    const double * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
#else
187
__global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_kernel_real_single(float * const __restrict__ q, const float * const __restrict__ hh, const float * const __restrict__ hh_dot,
188
189
190
191
192
    const float * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
#endif

{
#ifdef DOUBLE_PRECISION_REAL
193
194
    __shared__ double dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
    __shared__ double q_s[BLOCK_CYCLIC_BLOCKSIZE+1];
195
#else
196
197
    __shared__ float dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
    __shared__ float q_s[BLOCK_CYCLIC_BLOCKSIZE+1];
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#endif

    int b_idx, t_idx, q_off, h_off, w_off, j, t_s, q_delta, hh_delta;
#ifdef DOUBLE_PRECISION_REAL
    double q_v_1, q_v_2, hh_v_1, hh_v_2, tau_1, tau_2, s_1, s_2, dot_p, hh_v_3, my_r1, my_r2;
#else
    float q_v_1, q_v_2, hh_v_1, hh_v_2, tau_1, tau_2, s_1, s_2, dot_p, hh_v_3, my_r1, my_r2;
#endif
    // The block index selects the eigenvector (EV) which the current block is responsible for
    b_idx = blockIdx.x;

    // The thread index selects the position inside the eigenvector selected above
    t_idx = threadIdx.x;

    // The warp offset for the current thread: 0 for the first warp, 32 for the second etc.
    w_off = (t_idx >> 5) << 5;

    // The entire contents of the shared reduction buffers must be reset

#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
218
   reset_dotp_buffers_real_double(dotp_s);
219
#else
Andreas Marek's avatar
Andreas Marek committed
220
    reset_dotp_buffers_real_single(dotp_s);
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#endif

    // Compute initial access indices
    j = off + ncols - 1;
    q_off = b_idx + (j + t_idx) * ldq;
    h_off = j * nb + t_idx;
    t_s = t_idx >> 1;
    q_delta = ldq << 1;
    hh_delta = nb << 1;

    // Load the last EV components in the EV cache
    if (t_idx > 0)
    {
        q_s[t_idx + 1] = q[q_off];
    }

    // Ensure the ring buffer and reduction buffers are initialized
Andreas Marek's avatar
Andreas Marek committed
238
    sync_real_threads<HAVE_2_WARPS>();
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271

    while (j >= off + 1)
    {
        // Per-iteration GMem I/O reads are in order to improve cache hit ratio

        // Read the corresponding compotents in the 2 Householder reflectors
        hh_v_1 = __ldg(&hh[h_off]);
        hh_v_2 = __ldg(&hh[h_off - nb]);
        hh_v_3 = (t_idx == 0)? 0.0 : __ldg(&hh[h_off - 1]);

        // Read the pre-computed dot-product of the 2 Householder reflectors
        dot_p = __ldg(&hh_dot[j - 1]);

        // Read the pre-computed values for "Tau" corresponding to the 2 Householder reflectors
        tau_1 = __ldg(&hh_tau[j]);
        tau_2 = __ldg(&hh_tau[j - 1]);

        // Only read the new EV components (the others are already stored in the shared EV cache, q_s)
        if (t_idx == 0)
        {
            q_s[0] = q[q_off - ldq];
            q_s[1] = q[q_off];
        }

        // Fill the shared buffers for the dot products bewtween the EV subset and the Householder reflectors
        q_v_1 = q_s[t_idx + 1];
        q_v_2 = q_s[t_idx];

        my_r1 = q_v_1 * hh_v_1 * tau_1;
        my_r2 = q_v_2 * hh_v_2 * tau_2;

        // After using "shfl_xor", both threads in a pair will hold the same values
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
272
273
        my_r1 += shfl_xor_real_double(my_r1, 1);
        my_r2 += shfl_xor_real_double(my_r2, 1);
274
#else
Andreas Marek's avatar
Andreas Marek committed
275
276
        my_r1 += shfl_xor_real_single(my_r1, 1);
        my_r2 += shfl_xor_real_single(my_r2, 1);
277
278
279
280
#endif

        // Now both threads in a pair can write to the same reduction buffer address without race-condition issues
        dotp_s[t_s] = my_r1;
281
	//attention
282
283
284
        dotp_s[t_s + 64] = my_r2;

        // Ensure the reduction buffers are fully populated
Andreas Marek's avatar
Andreas Marek committed
285
        sync_real_threads<HAVE_2_WARPS>();
286
287
288

        // Perform the 2 reductions using only the first warp (we assume the warp size is 32, valid up to CC 3.x)
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
289
        double_warp_reduce_real_double<REDUCE_START_OFFSET, HAVE_2_WARPS>(dotp_s, w_off);
290
#else
Andreas Marek's avatar
Andreas Marek committed
291
        float_warp_reduce_real_single<REDUCE_START_OFFSET, HAVE_2_WARPS>(dotp_s, w_off);
292
293
#endif
        // Ensure every thread will have access to the reduction results
Andreas Marek's avatar
Andreas Marek committed
294
        sync_real_threads<HAVE_2_WARPS>();
295
296
297

        // Each thread collects the reduction results
        s_1 = dotp_s[0];
298
299

	// attention
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
        s_2 = dotp_s[64];

        // Each thread updates its corresponding EV component
        q_v_2 = q_v_2 - hh_v_3 * s_1 - hh_v_2 * s_2 + tau_2 * hh_v_2 * s_1 * dot_p;

        if (t_idx == blockDim.x - 1)
        {
            // The last thread writes the last 2 EV components to the EV matrix
            q[q_off] = q_v_1 - hh_v_1 * s_1;
            q[q_off - ldq] = q_v_2;
        }
        else
        {
            // All other threads update the EV cache for the next iteration
            q_s[t_idx + 2] = q_v_2;
        }

Andreas Marek's avatar
Andreas Marek committed
317
        sync_real_threads<HAVE_2_WARPS>();
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351

        // Update access indices
        q_off -= q_delta;
        h_off -= hh_delta;
        j -= 2;
    }

    // Once the previous loop has finished, we have at most 1 more iteration to perform

    if (j == off - 1)
    {
        // No iterations remain, so the final contents of the EV matrix are updated
        if (t_idx < blockDim.x - 1)
        {
            q[q_off + ldq] = q_v_2;
        }
    }
    else
    {
        // One iteration remains; it must be processed separately
        if (t_idx == 0)
        {
            // Only one more EV element needs to be loaded
            q_s[1] = q[q_off];
        }

        // As before, we first read the EV and Householder components
        q_v_1 = q_s[t_idx + 1];
        hh_v_1 = __ldg(&hh[h_off]);
        tau_1 = __ldg(&hh_tau[j]);

        // We prepare the reduction buffer
        my_r1 = q_v_1 * hh_v_1 * tau_1;
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
352
        my_r1 += shfl_xor_real_double(my_r1, 1);
353
#else
Andreas Marek's avatar
Andreas Marek committed
354
        my_r1 += shfl_xor_real_single(my_r1, 1);
355
356
357
#endif
        dotp_s[t_s] = my_r1;

Andreas Marek's avatar
Andreas Marek committed
358
        sync_real_threads<HAVE_2_WARPS>();
359
360

        // We perform the reduction using the first warp only
361
	// attention
362
363
364
        if (t_idx < 32)
        {
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
365
            dotp_s[t_idx] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
366
#else
Andreas Marek's avatar
Andreas Marek committed
367
            dotp_s[t_idx] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
368
369
370
#endif
        }

Andreas Marek's avatar
Andreas Marek committed
371
        sync_real_threads<HAVE_2_WARPS>();
372
373
374
375
376
377
378
379

        // The last EV components are written to the EV matrix
        q[q_off] = q_v_1 - hh_v_1 * dotp_s[0];
    }
}

// This is a host wrapper for calling the appropriate back-transformation kernel, based on the SCALAPACK block size
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
380
 extern "C" void launch_compute_hh_trafo_c_kernel_real_double(double * const q, const double * const hh, const double * const hh_dot,  const double * const hh_tau, const int nev, const int nb, const int ldq, const int off, const int ncols)
381
#else
Andreas Marek's avatar
Andreas Marek committed
382
 extern "C" void launch_compute_hh_trafo_c_kernel_real_single(float * const q, const float * const hh, const float * const hh_dot,  const float * const hh_tau, const int nev, const int nb, const int ldq, const int off, const int ncols)
383
384
385
386
#endif
{
    switch (nb)
    {
387
        // attention
388
389
390
        case 128:
        case 64:
#ifdef DOUBLE_PRECISION_REAL
391
            compute_hh_trafo_kernel_real_double<16, true><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
392
#else
393
            compute_hh_trafo_kernel_real_single<16, true><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
394
395
396
397
398
#endif
            break;

        case 32:
#ifdef DOUBLE_PRECISION_REAL
399
            compute_hh_trafo_kernel_real_double<8, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
400
#else
401
            compute_hh_trafo_kernel_real_single<8, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
402
403
404
405
406
#endif
            break;

        case 16:
#ifdef DOUBLE_PRECISION_REAL
407
            compute_hh_trafo_kernel_real_double<4, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
408
#else
409
            compute_hh_trafo_kernel_real_single<4, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
410
411
412
413
414
#endif
            break;

        case 8:
#ifdef DOUBLE_PRECISION_REAL
415
            compute_hh_trafo_kernel_real_double<2, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
416
#else
417
            compute_hh_trafo_kernel_real_single<2, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
418
419
420
421
422
#endif
            break;

        case 4:
#ifdef DOUBLE_PRECISION_REAL
423
            compute_hh_trafo_kernel_real_double<1, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
424
#else
425
            compute_hh_trafo_kernel_real_single<1, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
426
427
428
429
430
431
#endif
            break;

        case 2:
        case 1:
#ifdef DOUBLE_PRECISION_REAL
432
            compute_hh_trafo_kernel_real_double<0, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
433
#else
434
            compute_hh_trafo_kernel_real_single<0, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
435
436
437
438
#endif
            break;

        default:
439
            printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and BLOCK_CYCLIC_BLOCKSIZE .\n");
440
441
442
    }
}